Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll | 17
-rw-r--r--  llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll | 81
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll | 82
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/cast_ldst.ll | 10
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/fparith.ll | 172
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll | 612
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/freeshift.ll | 80
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/gep.ll | 1820
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/immediates.ll | 180
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/insertelement.ll | 39
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll | 371
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll | 14
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/load_store.ll | 502
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/logicalop.ll | 176
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll | 77
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll | 100
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll | 81
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/mve-intrinsic-cost-kinds.ll | 181
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/mve-target-intrinsics.ll | 24
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/select.ll | 388
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll | 77
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/shuffle.ll | 452
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll | 77
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll | 41
-rw-r--r--  llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll | 6
-rw-r--r--  llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll | 15
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll | 4
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll | 4
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll | 48
-rw-r--r--  llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll | 13
-rw-r--r--  llvm/test/Analysis/MemorySSA/pr28880.ll | 5
-rw-r--r--  llvm/test/Analysis/MemorySSA/pr39197.ll | 34
-rw-r--r--  llvm/test/Analysis/MemorySSA/pr40038.ll | 11
-rw-r--r--  llvm/test/Analysis/MemorySSA/pr43569.ll | 8
-rw-r--r--  llvm/test/Analysis/ScalarEvolution/pr22674.ll | 4
-rw-r--r--  llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll | 4
-rw-r--r--  llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll | 7
-rw-r--r--  llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll | 26
-rw-r--r--  llvm/test/Assembler/atomic.ll | 19
-rw-r--r--  llvm/test/Assembler/metadata-annotations.ll | 9
-rw-r--r--  llvm/test/Bitcode/DILocation-implicit-code.ll | 31
-rw-r--r--  llvm/test/Bitcode/drop-debug-info.3.5.ll | 4
-rw-r--r--  llvm/test/Bitcode/thinlto-deadstrip-flag.ll | 20
-rw-r--r--  llvm/test/Bitcode/thinlto-index-flags.ll | 39
-rw-r--r--  llvm/test/Bitcode/upgrade-tbaa.ll | 4
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir | 30
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir | 19
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir | 20
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir | 102
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir | 2
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll | 489
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-smull.ll | 14
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll | 1943
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 57
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-vcvt.ll | 30
-rw-r--r--  llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll | 178
-rw-r--r--  llvm/test/CodeGen/AArch64/dup.ll | 12
-rw-r--r--  llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll | 21
-rw-r--r--  llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll | 21
-rw-r--r--  llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll | 73
-rw-r--r--  llvm/test/CodeGen/AArch64/load-zext-bitcast.ll | 80
-rw-r--r--  llvm/test/CodeGen/AArch64/pr164181.ll | 640
-rw-r--r--  llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll | 109
-rw-r--r--  llvm/test/CodeGen/AArch64/sme-za-exceptions.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 22
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir | 97
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll | 21
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 69
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 738
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir | 275
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir | 275
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir | 48
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 24
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll | 16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 56
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/add-max.ll | 151
-rw-r--r--  llvm/test/CodeGen/AMDGPU/addsub64_carry.ll | 36
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll | 1260
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll | 135
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 210
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | 585
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll | 90
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll | 90
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16.ll | 218
-rw-r--r--  llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll | 144
-rw-r--r--  llvm/test/CodeGen/AMDGPU/carryout-selection.ll | 614
-rw-r--r--  llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ctpop16.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/div_i128.ll | 32
-rw-r--r--  llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll | 25
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 35
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fdiv.f64.ll | 7
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 22
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmaximum.ll | 921
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmed3.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fminimum.ll | 921
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg-combines.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 128
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptrunc.ll | 36
-rw-r--r--  llvm/test/CodeGen/AMDGPU/frem.ll | 11
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 44
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll | 115
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll | 81
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll | 81
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll | 115
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inline-attr.ll | 16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll | 191
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 128
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll | 222
-rw-r--r--  llvm/test/CodeGen/AMDGPU/minmax.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 82
-rw-r--r--  llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 64
-rw-r--r--  llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sdiv64.ll | 146
-rw-r--r--  llvm/test/CodeGen/AMDGPU/srem.ll | 654
-rw-r--r--  llvm/test/CodeGen/AMDGPU/srem64.ll | 207
-rw-r--r--  llvm/test/CodeGen/AMDGPU/stackguard.ll | 14
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uaddo.ll | 54
-rw-r--r--  llvm/test/CodeGen/AMDGPU/udiv64.ll | 80
-rw-r--r--  llvm/test/CodeGen/AMDGPU/urem64.ll | 146
-rw-r--r--  llvm/test/CodeGen/AMDGPU/usubo.ll | 54
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wave32.ll | 190
-rw-r--r--  llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 8
-rw-r--r--  llvm/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/ARMLoadStoreDBG.mir | 4
-rw-r--r--  llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll | 6
-rw-r--r--  llvm/test/CodeGen/ARM/O3-pipeline.ll | 1
-rw-r--r--  llvm/test/CodeGen/ARM/Windows/wineh-basic.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/byval_load_align.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/call-graph-section-assembly.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/cfguard-module-flag.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/clang-section.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/cmse-clear-float-bigend.mir | 2
-rw-r--r--  llvm/test/CodeGen/ARM/coalesce-dbgvalue.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/constantpool-promote-dbg.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/constantpool-promote.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/early-cfi-sections.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/fp16-vld.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/global-merge-1.ll | 6
-rw-r--r--  llvm/test/CodeGen/ARM/isel-v8i32-crash.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-arm.ll | 138
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-cbz-range.ll | 81
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll | 99
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-thumb.ll | 215
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-thumb2.ll | 163
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi.ll | 28
-rw-r--r--  llvm/test/CodeGen/ARM/out-of-registers.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/relax-per-target-feature.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/softfp-constant-comparison.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/stack_guard_remat.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/struct-byval-frame-index.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/subtarget-align.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/unschedule-first-call.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/vector-spilling.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/vldm-sched-a9.ll | 2
-rw-r--r--  llvm/test/CodeGen/AVR/dynalloca.ll | 20
-rw-r--r--  llvm/test/CodeGen/AVR/issue-163015.ll | 32
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/binary-format.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll | 11
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-builtin.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-global-var.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-func.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-section.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/filename.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-non-void.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-source.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-typedef.ll | 8
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-void.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll | 28
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll | 24
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/local-var.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/pruning-const.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll | 59
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/ptr-named.ll | 75
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-func.ll | 13
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-inited.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-sec.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll | 8
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/weak-global-2.ll | 5
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/weak-global.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll | 9
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll | 13
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll | 27
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll | 22
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll | 19
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll | 27
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll | 13
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll | 13
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll | 20
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll | 20
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll | 20
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll | 24
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll | 26
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll | 26
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll | 20
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll | 26
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/store-addr.ll | 27
-rw-r--r--  llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll | 19
-rw-r--r--  llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll | 19
-rw-r--r--  llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll | 17
-rw-r--r--  llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll | 17
-rw-r--r--  llvm/test/CodeGen/BPF/callx.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/dwarfdump.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/i128.ll | 22
-rw-r--r--  llvm/test/CodeGen/BPF/is_trunc_free.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/is_zext_free.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/objdump_two_funcs.ll | 9
-rw-r--r--  llvm/test/CodeGen/BPF/optnone-1.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/reloc-btf-2.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/reloc-btf.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/simplifycfg.ll | 27
-rw-r--r--  llvm/test/CodeGen/BPF/warn-stack.ll | 30
-rw-r--r--  llvm/test/CodeGen/BPF/xadd.ll | 7
-rw-r--r--  llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll | 13
-rw-r--r--  llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll | 9
-rw-r--r--  llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll | 63
-rw-r--r--  llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll | 55
-rw-r--r--  llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll | 54
-rw-r--r--  llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll | 52
-rw-r--r--  llvm/test/CodeGen/Hexagon/masked_gather.ll | 58
-rw-r--r--  llvm/test/CodeGen/Hexagon/vector-gather.ll | 27
-rw-r--r--  llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll | 12
-rw-r--r--  llvm/test/CodeGen/MIR/AArch64/parse-shufflemask-invalid-scalar.mir | 15
-rw-r--r--  llvm/test/CodeGen/MIR/AArch64/parse-shufflemask.mir | 51
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json | 18
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json | 18
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json | 11
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json | 11
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt | 291
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt | 291
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/if-else.mir | 8
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir | 22
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll | 6
-rw-r--r--  llvm/test/CodeGen/MSP430/libcalls.ll | 14
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/store-fp-zero-to-x0.ll | 320
-rw-r--r--  llvm/test/CodeGen/RISCV/rv32p.ll | 709
-rw-r--r--  llvm/test/CodeGen/RISCV/rv64p.ll | 677
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll | 191
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll | 191
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll | 160
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll | 335
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll | 125
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll | 815
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll | 2336
-rw-r--r--  llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll | 9
-rw-r--r--  llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll | 6
-rw-r--r--  llvm/test/CodeGen/Thumb/PR17309.ll | 4
-rw-r--r--  llvm/test/CodeGen/Thumb/fastcc.ll | 2
-rw-r--r--  llvm/test/CodeGen/Thumb/ldm-merge-call.ll | 4
-rw-r--r--  llvm/test/CodeGen/Thumb/stack_guard_remat.ll | 2
-rw-r--r--  llvm/test/CodeGen/Thumb/stm-merge.ll | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-block-debug.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-1-pred.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-2-preds.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-ctrl-flow.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-non-consecutive-ins.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-1-ins.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-2-ins.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-4-ins.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-optnone.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/pacbti-m-outliner-4.ll | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/stack_guard_remat.ll | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/t2sizereduction.mir | 2
-rw-r--r--  llvm/test/CodeGen/WebAssembly/memory-interleave.ll | 5
-rw-r--r--  llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll | 244
-rw-r--r--  llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll | 246
-rw-r--r--  llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir | 2
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir | 2
-rw-r--r--  llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/atom-fixup-lea4.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/atomic-load-store.ll | 955
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir | 2
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir | 2
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir | 2
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll | 94
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/bit-piece-comment.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/call-graph-section-assembly.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/catchpad-regmask.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/catchpad-weight.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/clang-section-coff.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/cleanuppad-inalloca.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/combine-adc.ll | 48
-rw-r--r--  llvm/test/CodeGen/X86/combine-sbb.ll | 81
-rw-r--r--  llvm/test/CodeGen/X86/complex-fastmath.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/dag-optnone.ll | 5
-rw-r--r--  llvm/test/CodeGen/X86/dag-update-nodetomatch.ll | 129
-rw-r--r--  llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/dbg-changes-codegen.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/dbg-combine.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/debug-loclists-lto.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/debugloc-argsize.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/early-cfi-sections.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/fadd-combines.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/fdiv.ll | 9
-rw-r--r--  llvm/test/CodeGen/X86/fma_patterns_wide.ll | 19
-rw-r--r--  llvm/test/CodeGen/X86/fold-tied-op.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/fp-intrinsics-flags.ll | 50
-rw-r--r--  llvm/test/CodeGen/X86/fp128-g.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/fp128-i128.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/frame-order.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/fsafdo_test2.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/i386-shrink-wrapping.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/inline-asm-A-constraint.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/label-annotation.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/label-heapallocsite.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/late-remat-update.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/lifetime-alias.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/limit-split-cost.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/misched-copy.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/misched-matmul.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/movpc32-check.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/nocf_check.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/pr15705.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr18846.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr31045.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr32610.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr34080-2.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr34080.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr34629.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr34634.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr42727.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr48064.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/recip-fastmath.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/recip-fastmath2.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/regparm.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/seh-catchpad.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/seh-except-finally.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/seh-no-invokes.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/shrinkwrap-hang.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/sqrt-fastmath.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/sse1.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll | 57
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll | 19
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll | 39
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll | 19
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll | 35
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll | 43
-rw-r--r--  llvm/test/CodeGen/X86/stack-protector-3.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/stack_guard_remat.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/tail-merge-wineh.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/tls-shrink-wrapping.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/unused_stackslots.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/uwtables.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/vec_int_to_fp.ll | 5
-rw-r--r--  llvm/test/CodeGen/X86/vector-sqrt.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/vector-width-store-merge.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/win-cleanuppad.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/win32-seh-catchpad.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/win32-seh-nested-finally.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll | 6
-rw-r--r--  llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll | 49
-rw-r--r--  llvm/test/LTO/AArch64/Inputs/bar.ll (renamed from llvm/test/LTO/AArch64/TestInputs/bar.ll) | 0
-rw-r--r--  llvm/test/LTO/AArch64/Inputs/fiz.ll (renamed from llvm/test/LTO/AArch64/TestInputs/fiz.ll) | 0
-rw-r--r--  llvm/test/LTO/AArch64/Inputs/foo.ll (renamed from llvm/test/LTO/AArch64/TestInputs/foo.ll) | 0
-rw-r--r--  llvm/test/LTO/AArch64/Inputs/old.ll (renamed from llvm/test/LTO/AArch64/TestInputs/old.ll) | 0
-rw-r--r--  llvm/test/LTO/AArch64/link-branch-target-enforcement.ll | 2
-rw-r--r--  llvm/test/LTO/AArch64/link-sign-return-address.ll | 8
-rw-r--r--  llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s | 2
-rw-r--r--  llvm/test/MC/AArch64/SME2p3/luti6-diagnostics.s | 191
-rw-r--r--  llvm/test/MC/AArch64/SME2p3/luti6.s | 472
-rw-r--r--  llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s | 5
-rw-r--r--  llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s | 2
-rw-r--r--  llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s | 4
-rw-r--r--  llvm/test/MC/AArch64/SVE2p2/fmmla-diagnostics.s | 15
-rw-r--r--  llvm/test/MC/AArch64/SVE2p2/fmmla.s | 45
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/arithmetic-diagnostics.s | 147
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/arithmetic.s | 275
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/bfmmla-diagnostics.s | 19
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/bfmmla.s | 45
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/cvt-diagnostics.s | 193
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/cvt.s | 321
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/directive-arch-negative.s | 7
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/directive-arch_extension-negative.s | 7
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/directive-cpu-negative.s | 7
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/dot-diagnostics.s | 137
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/dot.s | 173
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/luti6-diagnostics.s | 70
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/luti6.s | 115
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/qshrn-diagnostics.s | 343
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/qshrn.s | 255
-rw-r--r--  llvm/test/MC/AArch64/armv8.4a-mpam.s | 101
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s | 18
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-gcie.s | 985
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-memsys.s | 140
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-mpamv2-diagnostics.s | 18
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-mpamv2.s | 126
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-mtetc-diagnostics.s | 16
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-mtetc.s | 29
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-tlbid-diagnostics.s | 64
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-tlbid.s | 84
-rw-r--r--  llvm/test/MC/AArch64/neon-fdot-diagnostics.s | 59
-rw-r--r--  llvm/test/MC/AArch64/neon-fdot.s | 147
-rw-r--r--  llvm/test/MC/AArch64/neon-fmmla-HtoS-diagnostics.s | 24
-rw-r--r--  llvm/test/MC/AArch64/neon-fmmla-HtoS.s | 37
-rw-r--r--  llvm/test/MC/AArch64/neon-fmmla-diagnostics.s | 24
-rw-r--r--  llvm/test/MC/AArch64/neon-fmmla.s | 37
-rw-r--r--  llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s | 4
-rw-r--r--  llvm/test/MC/AMDGPU/gfx1250_err.s | 10
-rw-r--r--  llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt | 66
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt | 3
-rw-r--r--  llvm/test/MC/Disassembler/Xtensa/debug.txt | 2
-rw-r--r--  llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s | 36
-rw-r--r--  llvm/test/MC/ELF/cfi-sframe-cfi-escape.s | 46
-rw-r--r--  llvm/test/MC/Hexagon/arch-support.s | 3
-rw-r--r--  llvm/test/MC/Hexagon/v81_arch.s | 10
-rw-r--r--  llvm/test/MC/PowerPC/ppc64-encoding-ext.s | 14
-rw-r--r--  llvm/test/MC/Xtensa/debug.s | 2
-rw-r--r--  llvm/test/MachineVerifier/test_g_shuffle_vector.mir | 12
-rw-r--r--  llvm/test/TableGen/intrinsic-manual-name.td | 6
-rw-r--r--  llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll | 10
-rw-r--r--  llvm/test/ThinLTO/X86/dtlto/json.ll | 20
-rw-r--r--  llvm/test/Transforms/ADCE/2016-09-06.ll | 4
-rw-r--r--  llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll | 4
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/basic.ll | 4
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/call-nested.ll | 9
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/call.ll | 7
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/diamond.ll | 10
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/first-only.ll | 4
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/invoke.ll | 30
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/multiple.ll | 4
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/no-discriminators.ll | 7
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/oneline.ll | 7
-rw-r--r--  llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll | 30
-rw-r--r--  llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll | 4
-rw-r--r--  llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll | 4
-rw-r--r--  llvm/test/Transforms/CodeGenPrepare/dom-tree.ll | 4
-rw-r--r--  llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll | 7
-rw-r--r--  llvm/test/Transforms/ConstraintElimination/add-nsw.ll | 6
-rw-r--r--  llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll | 6
-rw-r--r--  llvm/test/Transforms/Coroutines/coro-debug.ll | 37
-rw-r--r--  llvm/test/Transforms/Coroutines/coro-split-dbg.ll | 49
-rw-r--r--  llvm/test/Transforms/DeadArgElim/dbginfo.ll | 6
-rw-r--r--  llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll | 8
-rw-r--r--  llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll | 3
-rw-r--r--  llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll | 4
-rw-r--r--  llvm/test/Transforms/FunctionImport/funcimport_debug.ll | 7
-rw-r--r--  llvm/test/Transforms/GCOVProfiling/exit-block.ll | 8
-rw-r--r--  llvm/test/Transforms/GCOVProfiling/linezero.ll | 19
-rw-r--r--  llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll | 4
-rw-r--r--  llvm/test/Transforms/GVN/cond_br2.ll | 35
-rw-r--r--  llvm/test/Transforms/GVN/matrix-intrinsics.ll | 12
-rw-r--r--  llvm/test/Transforms/GVN/pr33549.ll | 6
-rw-r--r--  llvm/test/Transforms/GVN/pr42605.ll | 2
-rw-r--r--  llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll | 10
-rw-r--r--  llvm/test/Transforms/GVNHoist/pr30499.ll | 10
-rw-r--r--  llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll | 4
-rw-r--r--  llvm/test/Transforms/Inline/always-inline-attr.ll | 10
-rw-r--r--  llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll | 7
-rw-r--r--  llvm/test/Transforms/Inline/inline-vla.ll | 10
-rw-r--r--  llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll | 6
-rw-r--r--  llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll | 6
-rw-r--r--  llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll | 6
-rw-r--r--  llvm/test/Transforms/Inline/optimization-remarks.ll | 6
-rw-r--r--  llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll | 33
-rw-r--r--  llvm/test/Transforms/InstCombine/bitreverse-hang.ll | 4
-rw-r--r--  llvm/test/Transforms/InstCombine/constant-vector-insert.ll | 156
-rw-r--r--  llvm/test/Transforms/InstCombine/ctlz-cttz.ll | 145
-rw-r--r--  llvm/test/Transforms/InstCombine/intrinsic-select.ll | 36
-rw-r--r--  llvm/test/Transforms/InstCombine/phi.ll | 50
-rw-r--r--  llvm/test/Transforms/InstCombine/ptrtoaddr.ll | 131
-rw-r--r--  llvm/test/Transforms/InstCombine/scmp.ll | 56
-rw-r--r--  llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll | 3
-rw-r--r--  llvm/test/Transforms/InstCombine/select-extractelement.ll | 18
-rw-r--r--  llvm/test/Transforms/InstCombine/select_frexp.ll | 8
-rw-r--r--  llvm/test/Transforms/InstCombine/sub-gep.ll | 33
-rw-r--r--  llvm/test/Transforms/JumpThreading/ddt-crash3.ll | 6
-rw-r--r--  llvm/test/Transforms/LICM/volatile-alias.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopRotate/noalias.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/pr18165.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll | 226
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll | 80
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll | 14
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll | 108
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll | 90
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll | 751
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll | 26
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll | 48
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll | 48
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll | 14
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll | 44
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll | 66
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll | 276
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll | 8
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll | 121
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll | 32
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll | 11
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll | 52
-rw-r--r--  llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll | 126
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll | 137
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/constantfolder.ll | 69
-rw-r--r--  llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/metadata-width.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll | 30
-rw-r--r--  llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll | 8
-rw-r--r--  llvm/test/Transforms/MergeICmps/X86/pr41917.ll | 16
-rw-r--r--  llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll | 8
-rw-r--r--  llvm/test/Transforms/NewGVN/cond_br2-xfail.ll | 25
-rw-r--r--  llvm/test/Transforms/NewGVN/equivalent-phi.ll | 4
-rw-r--r--  llvm/test/Transforms/NewGVN/memory-handling.ll | 6
-rw-r--r--  llvm/test/Transforms/NewGVN/pr31483.ll | 11
-rw-r--r--  llvm/test/Transforms/NewGVN/pr31501.ll | 6
-rw-r--r--  llvm/test/Transforms/NewGVN/pr33187.ll | 9
-rw-r--r--  llvm/test/Transforms/NewGVN/pr33305.ll | 17
-rw-r--r--  llvm/test/Transforms/NewGVN/pr34430.ll | 4
-rw-r--r--  llvm/test/Transforms/NewGVN/pr34452.ll | 9
-rw-r--r--  llvm/test/Transforms/OpenMP/dead_use.ll | 18
-rw-r--r--  llvm/test/Transforms/OpenMP/icv_remarks.ll | 43
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll | 29
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll | 28
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll | 29
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll | 26
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-branch.ll | 28
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll | 47
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-switch.ll | 47
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll | 18
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll | 45
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll | 80
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll | 4
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll | 25
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll | 2
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll | 4
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll | 34
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll | 14
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll | 2
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll | 18
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll | 5
-rw-r--r--  llvm/test/Transforms/SCCP/conditions-ranges.ll | 25
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll | 15
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll | 4
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll | 4
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll | 7
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll | 4
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll | 5
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll | 4
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/consecutive-access.ll | 18
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll | 28
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll | 30
-rw-r--r--  llvm/test/Transforms/SROA/mem-par-metadata-sroa.ll | 8
-rw-r--r--  llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll | 3
-rw-r--r--  llvm/test/Transforms/SafeStack/ARM/debug.ll | 7
-rw-r--r--  llvm/test/Transforms/SafeStack/X86/debug-loc.ll | 5
-rw-r--r--  llvm/test/Transforms/SafeStack/X86/debug-loc2.ll | 5
-rw-r--r--  llvm/test/Transforms/SampleProfile/Inputs/profile-symbol-list.ll | 8
-rw-r--r--  llvm/test/Transforms/SampleProfile/branch.ll | 18
-rw-r--r--  llvm/test/Transforms/SampleProfile/csspgo-import-list.ll | 2
-rw-r--r--  llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/csspgo-inline.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/csspgo-summary.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/entry_counts_cold.ll | 5
-rw-r--r--  llvm/test/Transforms/SampleProfile/entry_counts_missing_dbginfo.ll | 5
-rw-r--r--  llvm/test/Transforms/SampleProfile/fsafdo_test.ll | 8
-rw-r--r--  llvm/test/Transforms/SampleProfile/gcc-simple.ll | 13
-rw-r--r--  llvm/test/Transforms/SampleProfile/inline-act.ll | 6
-rw-r--r--  llvm/test/Transforms/SampleProfile/misexpect.ll | 22
-rw-r--r--  llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll | 11
-rw-r--r--  llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll | 7
-rw-r--r--  llvm/test/Transforms/SampleProfile/norepeated-icp-4.ll | 2
-rw-r--r--  llvm/test/Transforms/SampleProfile/norepeated-icp.ll | 8
-rw-r--r--  llvm/test/Transforms/SampleProfile/offset.ll | 2
-rw-r--r--  llvm/test/Transforms/SampleProfile/profile-context-order.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/profile-context-tracker.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/profile-topdown-order.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/propagate.ll | 11
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll | 9
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll | 2
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-no-debug-info.ll | 8
-rw-r--r--  llvm/test/Transforms/SampleProfile/remarks.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/uniqname.ll | 13
-rw-r--r--  llvm/test/Transforms/Scalarizer/dbginfo.ll | 7
-rw-r--r--  llvm/test/Transforms/SimplifyCFG/Hexagon/switch-to-lookup-table.ll | 4
-rw-r--r--  llvm/test/Transforms/SimplifyCFG/X86/merge-cleanuppads.ll | 20
-rw-r--r--  llvm/test/Transforms/SimplifyCFG/pr50060-constantfold-loopid.ll | 26
-rw-r--r--  llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll | 4
-rw-r--r--  llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll | 4
-rw-r--r--  llvm/test/Transforms/Util/PredicateInfo/testandor.ll | 27
-rw-r--r--  llvm/test/Transforms/Util/dbg-user-of-aext.ll | 7
-rw-r--r--  llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll | 14
-rw-r--r--  llvm/test/Verifier/atomics.ll | 15
-rw-r--r--  llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll | 2
-rw-r--r--  llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll | 2
-rw-r--r--  llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir | 92
-rw-r--r--  llvm/test/tools/llvm-ir2vec/entities.mir | 28
-rw-r--r--  llvm/test/tools/llvm-ir2vec/error-handling.ll | 2
-rw-r--r--  llvm/test/tools/llvm-ir2vec/error-handling.mir | 41
-rw-r--r--  llvm/test/tools/llvm-ir2vec/output/lit.local.cfg | 3
-rw-r--r--  llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt | 33
-rw-r--r--  llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt | 7174
-rw-r--r--  llvm/test/tools/llvm-ir2vec/triplets.mir | 61
-rw-r--r--  llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s | 34
-rw-r--r--  llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test | 31
-rw-r--r--  llvm/test/tools/llvm-profdata/input-wildcard.test | 15
-rw-r--r--  llvm/test/tools/llvm-readobj/ELF/section-types.test | 5
-rw-r--r--  llvm/test/tools/opt/no-target-machine.ll | 18
729 files changed, 36762 insertions, 12437 deletions
diff --git a/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll b/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll
index ed851f2..67ce44e 100644
--- a/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll
+++ b/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll
@@ -12,14 +12,14 @@
@.str = private unnamed_addr constant [17 x i8] c"x = %lu\0Ay = %lu\0A\00", align 1
; Function Attrs: inlinehint nounwind uwtable
-define i32 @main() #0 {
+define i32 @main() {
entry:
%retval = alloca i32, align 4
%i = alloca i64, align 8
store i32 0, ptr %retval
store i64 0, ptr @y, align 8
store i64 0, ptr @x, align 8
- call void @srand(i32 422304) #3
+ call void @srand(i32 422304)
store i64 0, ptr %i, align 8
br label %for.cond
@@ -29,7 +29,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.end, !prof !1
for.body: ; preds = %for.cond
- %call = call i32 @rand() #3
+ %call = call i32 @rand()
%conv = sitofp i32 %call to double
%mul = fmul double %conv, 1.000000e+02
%div = fdiv double %mul, 0x41E0000000000000
@@ -65,17 +65,12 @@ for.end: ; preds = %for.cond
}
; Function Attrs: nounwind
-declare void @srand(i32) #1
+declare void @srand(i32)
; Function Attrs: nounwind
-declare i32 @rand() #1
+declare i32 @rand()
-declare i32 @printf(ptr, ...) #2
-
-attributes #0 = { inlinehint nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare i32 @printf(ptr, ...)
!llvm.ident = !{!0}
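
The pr22718.ll hunks above strip attribute groups the test never relied on. For background (and not part of the patch itself): in LLVM IR, a `#N` token on a function or call site is a reference to a module-level attribute group, so once the references are deleted, the group definitions are dead text. A minimal sketch with a hypothetical function @f:

; Hypothetical module illustrating attribute-group syntax; not from this patch.
; The #0 on @f attaches the group defined at module scope below.
define i32 @f() #0 {
entry:
  ret i32 0
}

; nounwind marks @f as never unwinding; a group with no remaining #N
; references contributes nothing, which is why the cleanup above can
; drop both the references and the attributes lines together.
attributes #0 = { nounwind }
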
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index e007800..ac654dd 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-1
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefixes=CHECK-VSCALE-2,CHECK-SVE
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve2p1 | FileCheck %s --check-prefixes=CHECK-VSCALE-2,CHECK-SVE2p1-OR-SME2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sme2 | FileCheck %s --check-prefixes=CHECK-VSCALE-2,CHECK-SVE2p1-OR-SME2
; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -intrinsic-cost-strategy=type-based-intrinsic-cost -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=TYPE_BASED_ONLY
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
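
The new RUN lines above run the same input under +sve, +sve2p1, and +sme2 and share check lines via --check-prefixes. A minimal sketch of that FileCheck pattern, with hypothetical prefixes and cost values:

; Hypothetical test fragment; prefixes and costs are illustrative only.
; RUN: opt < %s -passes="print<cost-model>" -disable-output -mattr=+sve 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
; RUN: opt < %s -passes="print<cost-model>" -disable-output -mattr=+sve2p1 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SVE2P1
;
; The label line is tagged CHECK, so both invocations match it; each cost
; line is tagged per feature, so each invocation checks only its own.
; CHECK-LABEL: 'f'
; CHECK-SVE: Cost Model: Found costs of 5
; CHECK-SVE2P1: Cost Model: Found costs of 1

Each FileCheck invocation activates only the lines tagged with one of its prefixes, so configurations that agree share a single CHECK line while diverging costs get a feature-specific prefix, as the diff above does with CHECK-SVE and CHECK-SVE2p1-OR-SME2.
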
@@ -920,7 +922,8 @@ define void @get_lane_mask() #0 {
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 13 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
@@ -934,28 +937,53 @@ define void @get_lane_mask() #0 {
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
-; CHECK-VSCALE-2-LABEL: 'get_lane_mask'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-SVE-LABEL: 'get_lane_mask'
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 13 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 5 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-SVE2p1-OR-SME2-LABEL: 'get_lane_mask'
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 5 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'get_lane_mask'
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
@@ -966,7 +994,8 @@ define void @get_lane_mask() #0 {
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 13 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 5 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
@@ -990,6 +1019,7 @@ define void @get_lane_mask() #0 {
%mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
%mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+ %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
%mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
%mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
@@ -1416,6 +1446,7 @@ declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32, i32)
declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32, i32)
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32)
declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32, i32)
+declare <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64, i64)
declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64)
declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16, i16)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64, i64)
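For reference when reading the CHECK-SVE and CHECK-SVE2p1-OR-SME2 blocks above: with -cost-kind=all the cost-model printer emits all four cost kinds per instruction (RThru = reciprocal throughput, CodeSize, Lat = latency, SizeLat = size-and-latency) and collapses them to a single number when all four agree. A reproduction sketch, assuming a locally built opt; the SVE target flags below are an assumption, the authoritative ones are in the file's own RUN lines outside this excerpt:

  opt -passes="print<cost-model>" -cost-kind=all -disable-output \
      -mtriple=aarch64 -mattr=+sve <path-to-this-test>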
diff --git a/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll
index 1e9a5f7..c0410cf 100644
--- a/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll
@@ -1,4 +1,6 @@
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
; ASM lines below are only for reference; tests in that direction should go to llvm/test/CodeGen/ARM
@@ -15,13 +17,18 @@ target triple = "armv7--linux-gnueabihf"
%T464 = type <4 x i64>
define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'direct'
+; COST-LABEL: 'direct'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
%v1 = load %T432, ptr %loadaddr2
; ASM: vld1.64
- %r3 = add %T432 %v0, %v1
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+ %r3 = add %T432 %v0, %v1
; ASM: vadd.i32
store %T432 %r3, ptr %storeaddr
; ASM: vst1.64
@@ -29,16 +36,22 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
}
define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'ups1632'
+; COST-LABEL: 'ups1632'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
%v1 = load %T416, ptr %loadaddr2
; ASM: vldr
%r1 = sext %T416 %v0 to %T432
%r2 = sext %T416 %v1 to %T432
-; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32>
- %r3 = add %T432 %r1, %r2
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+ %r3 = add %T432 %r1, %r2
; ASM: vaddl.s16
store %T432 %r3, ptr %storeaddr
; ASM: vst1.64
@@ -46,16 +59,22 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
}
define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'upu1632'
+; COST-LABEL: 'upu1632'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
%v1 = load %T416, ptr %loadaddr2
; ASM: vldr
%r1 = zext %T416 %v0 to %T432
%r2 = zext %T416 %v1 to %T432
-; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32>
- %r3 = add %T432 %r1, %r2
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+ %r3 = add %T432 %r1, %r2
; ASM: vaddl.u16
store %T432 %r3, ptr %storeaddr
; ASM: vst1.64
@@ -63,51 +82,66 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
}
define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'ups3264'
+; COST-LABEL: 'ups3264'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
%v1 = load %T232, ptr %loadaddr2
; ASM: vldr
- %r3 = add %T232 %v0, %v1
+ %r3 = add %T232 %v0, %v1
; ASM: vadd.i32
-; COST: cost of 1 for instruction: {{.*}} add <2 x i32>
%st = sext %T232 %r3 to %T264
; ASM: vmovl.s32
-; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64>
store %T264 %st, ptr %storeaddr
; ASM: vst1.64
ret void
}
define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'upu3264'
+; COST-LABEL: 'upu3264'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
%v1 = load %T232, ptr %loadaddr2
; ASM: vldr
- %r3 = add %T232 %v0, %v1
+ %r3 = add %T232 %v0, %v1
; ASM: vadd.i32
-; COST: cost of 1 for instruction: {{.*}} add <2 x i32>
%st = zext %T232 %r3 to %T264
; ASM: vmovl.u32
-; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64>
store %T264 %st, ptr %storeaddr
; ASM: vst1.64
ret void
}
define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'dn3216'
+; COST-LABEL: 'dn3216'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
%v1 = load %T432, ptr %loadaddr2
; ASM: vld1.64
- %r3 = add %T432 %v0, %v1
+ %r3 = add %T432 %v0, %v1
; ASM: vadd.i32
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
%st = trunc %T432 %r3 to %T416
; ASM: vmovn.i32
-; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16>
store %T416 %st, ptr %storeaddr
; ASM: vstr
ret void
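For reference: the COST check lines in this file are autogenerated by the script named in the NOTE line (utils/update_analyze_test_checks.py, UTC_ARGS: --version 6). A minimal regeneration sketch, assuming a locally built opt at build/bin/opt (that path is an assumption):

  python3 llvm/utils/update_analyze_test_checks.py --opt-binary=build/bin/opt \
      llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll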
diff --git a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
index c2248c2..e5bbac6 100644
--- a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv7-apple-ios6.0.0 -mcpu=cortex-a9 < %s | FileCheck %s --check-prefix=CHECK-NEON
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=cortex-a9 < %s | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
diff --git a/llvm/test/Analysis/CostModel/ARM/fparith.ll b/llvm/test/Analysis/CostModel/ARM/fparith.ll
index f9424e8..6f2626c 100644
--- a/llvm/test/Analysis/CostModel/ARM/fparith.ll
+++ b/llvm/test/Analysis/CostModel/ARM/fparith.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @f32() {
; CHECK-MVE-LABEL: 'f32'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd float undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub float undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul float undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c = fadd float undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %d = fsub float undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %e = fmul float undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'f32'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd float undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub float undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul float undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c = fadd float undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %d = fsub float undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %e = fmul float undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c = fadd float undef, undef
%d = fsub float undef, undef
@@ -25,16 +25,16 @@ define void @f32() {
define void @f16() {
; CHECK-MVE-LABEL: 'f16'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd half undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub half undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul half undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c = fadd half undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %d = fsub half undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %e = fmul half undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'f16'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd half undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub half undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul half undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c = fadd half undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %d = fsub half undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %e = fmul half undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c = fadd half undef, undef
%d = fsub half undef, undef
@@ -44,16 +44,16 @@ define void @f16() {
define void @f64() {
; CHECK-MVE-LABEL: 'f64'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd double undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub double undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul double undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c = fadd double undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %d = fsub double undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %e = fmul double undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'f64'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd double undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub double undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul double undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c = fadd double undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %d = fsub double undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %e = fmul double undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c = fadd double undef, undef
%d = fsub double undef, undef
@@ -63,28 +63,28 @@ define void @f64() {
define void @vf32() {
; CHECK-MVE-LABEL: 'vf32'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub <4 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'vf32'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = fadd <2 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = fsub <2 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = fmul <2 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c4 = fadd <4 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d4 = fsub <4 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e4 = fmul <4 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c8 = fadd <8 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d8 = fsub <8 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e8 = fmul <8 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c2 = fadd <2 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d2 = fsub <2 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e2 = fmul <2 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c4 = fadd <4 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d4 = fsub <4 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e4 = fmul <4 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %c8 = fadd <8 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %d8 = fsub <8 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %e8 = fmul <8 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c2 = fadd <2 x float> undef, undef
%d2 = fsub <2 x float> undef, undef
@@ -100,28 +100,28 @@ define void @vf32() {
define void @vf16() {
; CHECK-MVE-LABEL: 'vf16'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub <4 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'vf16'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = fadd <2 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = fsub <2 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = fmul <2 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c4 = fadd <4 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d4 = fsub <4 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e4 = fmul <4 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c8 = fadd <8 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d8 = fsub <8 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e8 = fmul <8 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c2 = fadd <2 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d2 = fsub <2 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e2 = fmul <2 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c4 = fadd <4 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d4 = fsub <4 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e4 = fmul <4 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c8 = fadd <8 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d8 = fsub <8 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e8 = fmul <8 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c2 = fadd <2 x half> undef, undef
%d2 = fsub <2 x half> undef, undef
@@ -137,28 +137,28 @@ define void @vf16() {
define void @vf64() {
; CHECK-MVE-LABEL: 'vf64'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub <4 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'vf64'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub <4 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c2 = fadd <2 x double> undef, undef
%d2 = fsub <2 x double> undef, undef
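For reference: the CHECK-MVEFP lines above now carry distinct per-kind costs, e.g. RThru:4 CodeSize:2 Lat:4 SizeLat:4 for the <8 x float> operations, where CodeSize:2 is consistent with a 256-bit vector being legalized into two 128-bit MVE operations. A reproduction sketch using the file's own RUN-line flags:

  opt -passes="print<cost-model>" -cost-kind=all -disable-output \
      -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp llvm/test/Analysis/CostModel/ARM/fparith.ll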
diff --git a/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll b/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll
index aff7b19..af548f6 100644
--- a/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll
+++ b/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll
@@ -1,213 +1,213 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp,+fp64 < %s | FileCheck %s --check-prefix=CHECK-MVEFP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp,+fp64 < %s | FileCheck %s --check-prefix=CHECK-MVEFP
define void @casts() {
; CHECK-MVE-LABEL: 'casts'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u8 = call i8 @llvm.fptoui.sat.i8.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s16 = call i16 @llvm.fptosi.sat.i16.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 406 for instruction: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 314 for instruction: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 404 for instruction: %v4f64s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %v4f64u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 626 for instruction: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 548 for instruction: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 548 for instruction: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 548 for instruction: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 496 for instruction: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1360 for instruction: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1304 for instruction: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 922 for instruction: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 794 for instruction: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 844 for instruction: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 794 for instruction: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 844 for instruction: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 794 for instruction: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 844 for instruction: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1352 for instruction: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1296 for instruction: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1834 for instruction: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1578 for instruction: %v16f32u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1676 for instruction: %v16f32s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1578 for instruction: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1676 for instruction: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1576 for instruction: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1672 for instruction: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1568 for instruction: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4912 for instruction: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4800 for instruction: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 3018 for instruction: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2762 for instruction: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2860 for instruction: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2762 for instruction: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2860 for instruction: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2760 for instruction: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2856 for instruction: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2752 for instruction: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4880 for instruction: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4768 for instruction: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:7 Lat:23 SizeLat:23 for: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u8 = call i8 @llvm.fptoui.sat.i8.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f32s16 = call i16 @llvm.fptosi.sat.i16.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:7 Lat:24 SizeLat:24 for: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:7 Lat:23 SizeLat:23 for: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:7 Lat:24 SizeLat:24 for: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:43 Lat:85 SizeLat:85 for: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:134 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:43 Lat:85 SizeLat:85 for: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:134 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:240 CodeSize:85 Lat:169 SizeLat:169 for: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:406 CodeSize:59 Lat:133 SizeLat:133 for: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:378 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:314 CodeSize:85 Lat:169 SizeLat:169 for: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:276 CodeSize:58 Lat:131 SizeLat:131 for: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:276 CodeSize:58 Lat:131 SizeLat:131 for: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:276 CodeSize:58 Lat:131 SizeLat:131 for: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:404 CodeSize:59 Lat:133 SizeLat:133 for: %v4f64s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:376 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:626 CodeSize:169 Lat:337 SizeLat:337 for: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:548 CodeSize:114 Lat:259 SizeLat:259 for: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:548 CodeSize:114 Lat:259 SizeLat:259 for: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:548 CodeSize:115 Lat:261 SizeLat:261 for: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:496 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1360 CodeSize:117 Lat:265 SizeLat:265 for: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1304 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:922 CodeSize:169 Lat:337 SizeLat:337 for: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:794 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:844 CodeSize:114 Lat:259 SizeLat:259 for: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:794 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:844 CodeSize:114 Lat:259 SizeLat:259 for: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:794 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:844 CodeSize:115 Lat:261 SizeLat:261 for: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:792 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1352 CodeSize:117 Lat:265 SizeLat:265 for: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1296 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1834 CodeSize:337 Lat:673 SizeLat:673 for: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1578 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1676 CodeSize:226 Lat:515 SizeLat:515 for: %v16f32s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1578 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1676 CodeSize:227 Lat:517 SizeLat:517 for: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1576 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1672 CodeSize:229 Lat:521 SizeLat:521 for: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1568 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4912 CodeSize:233 Lat:529 SizeLat:529 for: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4800 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:3018 CodeSize:337 Lat:673 SizeLat:673 for: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2762 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2860 CodeSize:226 Lat:515 SizeLat:515 for: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2762 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2860 CodeSize:227 Lat:517 SizeLat:517 for: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2760 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2856 CodeSize:229 Lat:521 SizeLat:521 for: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2752 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4880 CodeSize:233 Lat:529 SizeLat:529 for: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4768 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'casts'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32u8 = call i8 @llvm.fptoui.sat.i8.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32s16 = call i16 @llvm.fptosi.sat.i16.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 284 for instruction: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 234 for instruction: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %v4f64s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %v4f64u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1148 for instruction: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1104 for instruction: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 762 for instruction: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 650 for instruction: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 684 for instruction: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 650 for instruction: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 684 for instruction: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 650 for instruction: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 684 for instruction: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 648 for instruction: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1192 for instruction: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1152 for instruction: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4488 for instruction: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4400 for instruction: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2698 for instruction: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2474 for instruction: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2540 for instruction: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2474 for instruction: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2540 for instruction: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2472 for instruction: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2536 for instruction: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2464 for instruction: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4560 for instruction: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4480 for instruction: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32u8 = call i8 @llvm.fptoui.sat.i8.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32s16 = call i16 @llvm.fptosi.sat.i16.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:76 CodeSize:5 Lat:9 SizeLat:9 for: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:72 CodeSize:3 Lat:5 SizeLat:5 for: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:80 CodeSize:35 Lat:45 SizeLat:45 for: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:62 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:62 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:62 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:94 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:84 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:284 CodeSize:6 Lat:11 SizeLat:11 for: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:278 CodeSize:3 Lat:5 SizeLat:5 for: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:234 CodeSize:69 Lat:89 SizeLat:89 for: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:196 CodeSize:42 Lat:51 SizeLat:51 for: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:196 CodeSize:42 Lat:51 SizeLat:51 for: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:196 CodeSize:42 Lat:51 SizeLat:51 for: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:324 CodeSize:43 Lat:53 SizeLat:53 for: %v4f64s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:304 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1148 CodeSize:43 Lat:53 SizeLat:53 for: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1104 CodeSize:5 Lat:9 SizeLat:9 for: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:762 CodeSize:137 Lat:177 SizeLat:177 for: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:650 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:684 CodeSize:82 Lat:99 SizeLat:99 for: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:650 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:684 CodeSize:82 Lat:99 SizeLat:99 for: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:650 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:684 CodeSize:83 Lat:101 SizeLat:101 for: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:648 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1192 CodeSize:85 Lat:105 SizeLat:105 for: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1152 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4488 CodeSize:85 Lat:105 SizeLat:105 for: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4400 CodeSize:9 Lat:17 SizeLat:17 for: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2698 CodeSize:273 Lat:353 SizeLat:353 for: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2474 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2540 CodeSize:162 Lat:195 SizeLat:195 for: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2474 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2540 CodeSize:163 Lat:197 SizeLat:197 for: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2472 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2536 CodeSize:165 Lat:201 SizeLat:201 for: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2464 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4560 CodeSize:169 Lat:209 SizeLat:209 for: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4480 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef)
%f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef)
@@ -324,110 +324,110 @@ define void @casts() {
define void @fp16() {
; CHECK-MVE-LABEL: 'fp16'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 406 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 402 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 402 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 550 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1362 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1306 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1250 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 994 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1092 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 994 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1092 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 992 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1680 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1576 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4920 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4808 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:7 Lat:23 SizeLat:23 for: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:7 Lat:24 SizeLat:24 for: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:43 Lat:85 SizeLat:85 for: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:134 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:240 CodeSize:85 Lat:169 SizeLat:169 for: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:406 CodeSize:59 Lat:133 SizeLat:133 for: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:378 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:480 CodeSize:169 Lat:337 SizeLat:337 for: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:352 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:402 CodeSize:114 Lat:259 SizeLat:259 for: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:352 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:402 CodeSize:114 Lat:259 SizeLat:259 for: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:352 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:550 CodeSize:115 Lat:261 SizeLat:261 for: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1362 CodeSize:117 Lat:265 SizeLat:265 for: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1306 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1250 CodeSize:337 Lat:673 SizeLat:673 for: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:994 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1092 CodeSize:226 Lat:515 SizeLat:515 for: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:994 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1092 CodeSize:227 Lat:517 SizeLat:517 for: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:992 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1680 CodeSize:229 Lat:521 SizeLat:521 for: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1576 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4920 CodeSize:233 Lat:529 SizeLat:529 for: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4808 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'fp16'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 284 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1112 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1102 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4484 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4400 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:44 CodeSize:5 Lat:9 SizeLat:9 for: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:3 Lat:5 SizeLat:5 for: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:76 CodeSize:5 Lat:9 SizeLat:9 for: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:72 CodeSize:3 Lat:5 SizeLat:5 for: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:10 CodeSize:5 Lat:9 SizeLat:9 for: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:5 SizeLat:5 for: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:284 CodeSize:6 Lat:11 SizeLat:11 for: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:278 CodeSize:3 Lat:5 SizeLat:5 for: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:6 Lat:11 SizeLat:11 for: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:14 CodeSize:3 Lat:5 SizeLat:5 for: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1112 CodeSize:8 Lat:15 SizeLat:15 for: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1102 CodeSize:3 Lat:5 SizeLat:5 for: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:124 CodeSize:75 Lat:85 SizeLat:85 for: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:48 CodeSize:5 Lat:9 SizeLat:9 for: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4484 CodeSize:79 Lat:93 SizeLat:93 for: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4400 CodeSize:5 Lat:9 SizeLat:9 for: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
%f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/freeshift.ll b/llvm/test/Analysis/CostModel/ARM/freeshift.ll
index 51e87b5..cd5c8c5 100644
--- a/llvm/test/Analysis/CostModel/ARM/freeshift.ll
+++ b/llvm/test/Analysis/CostModel/ARM/freeshift.ll
@@ -1,23 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi < %s | FileCheck %s
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @shl(i32 %a, i32 %b) {
; CHECK-LABEL: 'shl'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ss = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %xs = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ns = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %os = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %is = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %as = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ac = add i32 %b, %as
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ss = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %sc = sub i32 %b, %ss
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %xs = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %xc = xor i32 %b, %xs
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ns = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nc = and i32 %b, %ns
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %os = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %oc = or i32 %b, %os
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %is = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret void
;
%as = shl i32 %a, 3
%ac = add i32 %b, %as
@@ -36,19 +36,19 @@ define void @shl(i32 %a, i32 %b) {
define void @ashr(i32 %a, i32 %b) {
; CHECK-LABEL: 'ashr'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ss = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %xs = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ns = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %os = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %is = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %as = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ac = add i32 %b, %as
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ss = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %sc = sub i32 %b, %ss
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %xs = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %xc = xor i32 %b, %xs
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ns = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nc = and i32 %b, %ns
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %os = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %oc = or i32 %b, %os
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %is = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret void
;
%as = ashr i32 %a, 3
%ac = add i32 %b, %as
@@ -67,19 +67,19 @@ define void @ashr(i32 %a, i32 %b) {
define void @lshr(i32 %a, i32 %b) {
; CHECK-LABEL: 'lshr'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ss = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %xs = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ns = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %os = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %is = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %as = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ac = add i32 %b, %as
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ss = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %sc = sub i32 %b, %ss
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %xs = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %xc = xor i32 %b, %xs
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ns = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nc = and i32 %b, %ns
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %os = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %oc = or i32 %b, %os
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %is = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret void
;
%as = lshr i32 %a, 3
%ac = add i32 %b, %as
diff --git a/llvm/test/Analysis/CostModel/ARM/gep.ll b/llvm/test/Analysis/CostModel/ARM/gep.ll
index 48de193..cce87a5 100644
--- a/llvm/test/Analysis/CostModel/ARM/gep.ll
+++ b/llvm/test/Analysis/CostModel/ARM/gep.ll
@@ -1,98 +1,98 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V6M
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-V7M-NOFP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-V7M-FP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-T32
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-A32
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V6M
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-V7M-NOFP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-V7M-FP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-T32
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-A32
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @testi8(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testi8'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testi8'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testi8'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testi8'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testi8'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testi8'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testi8'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds i8, ptr %a, i32 1
%am4 = getelementptr inbounds i8, ptr %a, i32 -1
@@ -109,88 +109,88 @@ define void @testi8(ptr %a, i32 %i) {
define void @testi16(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testi16'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testi16'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testi16'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testi16'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testi16'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testi16'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testi16'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds i16, ptr %a, i32 1
%am4 = getelementptr inbounds i16, ptr %a, i32 -1
@@ -207,88 +207,88 @@ define void @testi16(ptr %a, i32 %i) {
define void @testi32(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testi32'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testi32'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testi32'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testi32'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testi32'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testi32'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testi32'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds i32, ptr %a, i32 1
%am4 = getelementptr inbounds i32, ptr %a, i32 -1
@@ -305,102 +305,102 @@ define void @testi32(ptr %a, i32 %i) {
define void @testi64(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testi64'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testi64'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testi64'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testi64'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testi64'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testi64'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testi64'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds i64, ptr %a, i32 1
%am4 = getelementptr inbounds i64, ptr %a, i32 -1
@@ -419,102 +419,102 @@ define void @testi64(ptr %a, i32 %i) {
define void @testhalf(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testhalf'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testhalf'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testhalf'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testhalf'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testhalf'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testhalf'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testhalf'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds half, ptr %a, i32 1
%am1 = getelementptr inbounds half, ptr %a, i32 -1
@@ -533,102 +533,102 @@ define void @testhalf(ptr %a, i32 %i) {
define void @testfloat(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testfloat'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testfloat'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testfloat'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testfloat'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testfloat'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testfloat'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testfloat'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds float, ptr %a, i32 1
%am1 = getelementptr inbounds float, ptr %a, i32 -1
@@ -647,102 +647,102 @@ define void @testfloat(ptr %a, i32 %i) {
define void @testdouble(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testdouble'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testdouble'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testdouble'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testdouble'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testdouble'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testdouble'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testdouble'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds double, ptr %a, i32 1
%am1 = getelementptr inbounds double, ptr %a, i32 -1
@@ -761,375 +761,375 @@ define void @testdouble(ptr %a, i32 %i) {
define void @testvecs(i32 %i) {
; CHECK-V6M-LABEL: 'testvecs'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testvecs'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testvecs'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testvecs'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testvecs'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testvecs'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testvecs'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
diff --git a/llvm/test/Analysis/CostModel/ARM/immediates.ll b/llvm/test/Analysis/CostModel/ARM/immediates.ll
index ed13636..cd42313 100644
--- a/llvm/test/Analysis/CostModel/ARM/immediates.ll
+++ b/llvm/test/Analysis/CostModel/ARM/immediates.ll
@@ -1,145 +1,53 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-T1-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-T2-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-T1-LATENCY
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-T2-LATENCY
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-T1-THROUGHPUT
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-T2-THROUGHPUT
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-T1
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-T2
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define i32 @const_costs() {
-; CHECK-T1-SIZE-LABEL: 'const_costs'
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
+; CHECK-T1-LABEL: 'const_costs'
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %add_1 = add i32 undef, 1
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %add_32767 = add i32 undef, 32767
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %sub_1 = sub i32 undef, 1
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %sub_32768 = sub i32 undef, 32768
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %mul_2 = mul i32 undef, 2
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %mul_3 = mul i32 undef, 3
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %mul_27 = mul i32 undef, 27
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %and_255 = and i32 undef, 255
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %and_65535 = and i32 undef, 65535
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %and_1 = and i32 undef, 1
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %xor_1 = xor i32 undef, 1
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %xor_7 = xor i32 undef, 7
+; CHECK-T1-NEXT: Cost Model: Found costs of 0 for: %gep_1 = getelementptr i32, ptr undef, i32 1
+; CHECK-T1-NEXT: Cost Model: Found costs of 0 for: %gep_16 = getelementptr i32, ptr undef, i32 16
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %cmp_244 = icmp ne i32 undef, 244
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %cmp_256 = icmp uge i32 undef, 256
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %cmp_1024 = icmp ult i32 undef, 1024
+; CHECK-T1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_1_0 = select i1 undef, i32 1, i32 0
+; CHECK-T1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_7_255 = select i1 undef, i32 7, i32 255
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: ret i32 1
;
-; CHECK-T2-SIZE-LABEL: 'const_costs'
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T1-LATENCY-LABEL: 'const_costs'
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T2-LATENCY-LABEL: 'const_costs'
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T1-THROUGHPUT-LABEL: 'const_costs'
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T2-THROUGHPUT-LABEL: 'const_costs'
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
+; CHECK-T2-LABEL: 'const_costs'
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %add_1 = add i32 undef, 1
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %add_32767 = add i32 undef, 32767
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %sub_1 = sub i32 undef, 1
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %sub_32768 = sub i32 undef, 32768
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %mul_2 = mul i32 undef, 2
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %mul_3 = mul i32 undef, 3
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %mul_27 = mul i32 undef, 27
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %and_255 = and i32 undef, 255
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %and_65535 = and i32 undef, 65535
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %and_1 = and i32 undef, 1
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %xor_1 = xor i32 undef, 1
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %xor_7 = xor i32 undef, 7
+; CHECK-T2-NEXT: Cost Model: Found costs of 0 for: %gep_1 = getelementptr i32, ptr undef, i32 1
+; CHECK-T2-NEXT: Cost Model: Found costs of 0 for: %gep_16 = getelementptr i32, ptr undef, i32 16
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %cmp_244 = icmp ne i32 undef, 244
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %cmp_256 = icmp uge i32 undef, 256
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %cmp_1024 = icmp ult i32 undef, 1024
+; CHECK-T2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_1_0 = select i1 undef, i32 1, i32 0
+; CHECK-T2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_7_255 = select i1 undef, i32 7, i32 255
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: ret i32 1
;
%add_1 = add i32 undef, 1
%add_32767 = add i32 undef, 32767
diff --git a/llvm/test/Analysis/CostModel/ARM/insertelement.ll b/llvm/test/Analysis/CostModel/ARM/insertelement.ll
index 5a922dd..f14b200 100644
--- a/llvm/test/Analysis/CostModel/ARM/insertelement.ll
+++ b/llvm/test/Analysis/CostModel/ARM/insertelement.ll
@@ -1,4 +1,5 @@
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
target triple = "thumbv7-apple-ios6.0.0"
@@ -7,12 +8,16 @@ target triple = "thumbv7-apple-ios6.0.0"
; due to renaming constraints.
%T_i8v = type <8 x i8>
%T_i8 = type i8
-; CHECK: insertelement_i8
-define void @insertelement_i8(ptr %saddr,
- ptr %vaddr) {
+define void @insertelement_i8(ptr %saddr, ptr %vaddr) {
+; CHECK-LABEL: 'insertelement_i8'
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <8 x i8>, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i8, ptr %saddr, align 1
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2 = insertelement <8 x i8> %v0, i8 %v1, i32 1
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store <8 x i8> %v2, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T_i8v, ptr %vaddr
%v1 = load %T_i8, ptr %saddr
-;CHECK: estimated cost of 3 for {{.*}} insertelement <8 x i8>
%v2 = insertelement %T_i8v %v0, %T_i8 %v1, i32 1
store %T_i8v %v2, ptr %vaddr
ret void
@@ -21,12 +26,16 @@ define void @insertelement_i8(ptr %saddr,
%T_i16v = type <4 x i16>
%T_i16 = type i16
-; CHECK: insertelement_i16
-define void @insertelement_i16(ptr %saddr,
- ptr %vaddr) {
+define void @insertelement_i16(ptr %saddr, ptr %vaddr) {
+; CHECK-LABEL: 'insertelement_i16'
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i16, ptr %saddr, align 2
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2 = insertelement <4 x i16> %v0, i16 %v1, i32 1
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store <4 x i16> %v2, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T_i16v, ptr %vaddr
%v1 = load %T_i16, ptr %saddr
-;CHECK: estimated cost of 3 for {{.*}} insertelement <4 x i16>
%v2 = insertelement %T_i16v %v0, %T_i16 %v1, i32 1
store %T_i16v %v2, ptr %vaddr
ret void
@@ -34,12 +43,16 @@ define void @insertelement_i16(ptr %saddr,
%T_i32v = type <2 x i32>
%T_i32 = type i32
-; CHECK: insertelement_i32
-define void @insertelement_i32(ptr %saddr,
- ptr %vaddr) {
+define void @insertelement_i32(ptr %saddr, ptr %vaddr) {
+; CHECK-LABEL: 'insertelement_i32'
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i32, ptr %saddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2 = insertelement <2 x i32> %v0, i32 %v1, i32 1
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store <2 x i32> %v2, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T_i32v, ptr %vaddr
%v1 = load %T_i32, ptr %saddr
-;CHECK: estimated cost of 3 for {{.*}} insertelement <2 x i32>
%v2 = insertelement %T_i32v %v0, %T_i32 %v1, i32 1
store %T_i32v %v2, ptr %vaddr
ret void
diff --git a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
deleted file mode 100644
index 6377437..0000000
--- a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
+++ /dev/null
@@ -1,371 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput < %s | FileCheck %s --check-prefix=THRU
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency < %s | FileCheck %s --check-prefix=LATE
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size < %s | FileCheck %s --check-prefix=SIZE
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency < %s | FileCheck %s --check-prefix=SIZE_LATE
-
-target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-
-; Test a cross-section of intrinsics for various cost-kinds.
-; Other test files may check for accuracy of a particular intrinsic
-; across subtargets or types. This is just a basic correctness check using an
-; ARM target and a legal scalar type (i32/float) and/or an
-; illegal vector type (16 x i32/float).
-
-declare i32 @llvm.smax.i32(i32, i32)
-declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
-
-declare float @llvm.fmuladd.f32(float, float, float)
-declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>)
-
-declare float @llvm.log2.f32(float)
-declare <16 x float> @llvm.log2.v16f32(<16 x float>)
-
-declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
-declare <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float>, <16 x float>, metadata, metadata)
-
-declare float @llvm.maximum.f32(float, float)
-declare <16 x float> @llvm.maximum.v16f32(<16 x float>, <16 x float>)
-
-declare i32 @llvm.cttz.i32(i32, i1)
-declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>, i1)
-
-declare i32 @llvm.ctlz.i32(i32, i1)
-declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1)
-
-declare i32 @llvm.fshl.i32(i32, i32, i32)
-declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
-
-declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
-declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
-declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
-
-declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
-
-declare i32 @llvm.ssa.copy.i32(i32)
-declare float @llvm.ssa.copy.f32(float)
-declare ptr @llvm.ssa.copy.p0(ptr)
-
-define void @smax(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
-; THRU-LABEL: 'smax'
-; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'smax'
-; LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'smax'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'smax'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
- %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
- ret void
-}
-
-define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float> %vb, <16 x float> %vc) {
-; THRU-LABEL: 'fmuladd'
-; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'fmuladd'
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'fmuladd'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'fmuladd'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
- %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
- ret void
-}
-
-define void @log2(float %a, <16 x float> %va) {
-; THRU-LABEL: 'log2'
-; THRU-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
-; THRU-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'log2'
-; LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
-; LATE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'log2'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.log2.f32(float %a)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'log2'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call float @llvm.log2.f32(float %a)
- %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
- ret void
-}
-
-define void @constrained_fadd(float %a, <16 x float> %va) strictfp {
-; THRU-LABEL: 'constrained_fadd'
-; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; THRU-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'constrained_fadd'
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; LATE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'constrained_fadd'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'constrained_fadd'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
- %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
- ret void
-}
-
-define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
-; THRU-LABEL: 'fmaximum'
-; THRU-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
-; THRU-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'fmaximum'
-; LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
-; LATE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'fmaximum'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'fmaximum'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call float @llvm.maximum.f32(float %a, float %b)
- %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
- ret void
-}
-
-define void @cttz(i32 %a, <16 x i32> %va) {
-; THRU-LABEL: 'cttz'
-; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'cttz'
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'cttz'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'cttz'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
- %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
- ret void
-}
-
-define void @ctlz(i32 %a, <16 x i32> %va) {
-; THRU-LABEL: 'ctlz'
-; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'ctlz'
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'ctlz'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'ctlz'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
- %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
- ret void
-}
-
-define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) {
-; THRU-LABEL: 'fshl'
-; THRU-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
-; THRU-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'fshl'
-; LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
-; LATE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'fshl'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'fshl'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
- %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
- ret void
-}
-
-define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) {
-; THRU-LABEL: 'maskedgather'
-; THRU-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'maskedgather'
-; LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'maskedgather'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'maskedgather'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
- ret void
-}
-
-define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) {
-; THRU-LABEL: 'maskedscatter'
-; THRU-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'maskedscatter'
-; LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'maskedscatter'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'maskedscatter'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc)
- ret void
-}
-
-define void @reduce_fmax(<16 x float> %va) {
-; THRU-LABEL: 'reduce_fmax'
-; THRU-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'reduce_fmax'
-; LATE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'reduce_fmax'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'reduce_fmax'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
- ret void
-}
-
-define void @memcpy(ptr %a, ptr %b, i32 %c) {
-; THRU-LABEL: 'memcpy'
-; THRU-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'memcpy'
-; LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'memcpy'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'memcpy'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
- ret void
-}
-
-define void @ssa_copy() {
-; THRU-LABEL: 'ssa_copy'
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'ssa_copy'
-; LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'ssa_copy'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'ssa_copy'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %i = call i32 @llvm.ssa.copy.i32(i32 undef)
- %f = call float @llvm.ssa.copy.f32(float undef)
- %p = call ptr @llvm.ssa.copy.p0(ptr undef)
- ret void
-}
diff --git a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
index 4404209..c98601f 100644
--- a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
@@ -1,17 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s
+
; Check that the memory cost model for a load of an unusually sized integer
; followed by a trunc to a register sized integer gives a cost of 1 rather
; than the expanded cost. Currently, this target does not have that
; expansion.
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK
-
; Check that the cost is 1 for an unusually sized integer load truncated to a register sized integer.
define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %out = load i128, ptr %ptr, align 8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc = trunc i128 %out to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i32 %trunc
;
%out = load i128, ptr %ptr
 %trunc = trunc i128 %out to i32
 ret i32 %trunc
@@ -20,8 +20,8 @@ define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
define i128 @loadUnusualInteger(ptr %ptr) {
; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %out = load i128, ptr %ptr, align 8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i128 %out
;
%out = load i128, ptr %ptr
 ret i128 %out
}
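
[Aside, not part of the patch: a minimal sketch of a test in the same style,
showing how the -cost-kind=all output format used throughout this diff is
asserted. The function name is hypothetical; the expected load cost line is
copied from the checks above, and in practice such assertions are regenerated
with utils/update_analyze_test_checks.py rather than written by hand.]

; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s
define i32 @loadWideWithTrunc(ptr %ptr) {
; CHECK-LABEL: 'loadWideWithTrunc'
; CHECK: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %out = load i128, ptr %ptr, align 8
  ; Wide i128 load on a 32-bit target: throughput and latency cost 4,
  ; size-based cost kinds 1, matching the checks in this diff.
  %out = load i128, ptr %ptr, align 8
  ; The trunc to a register-sized i32 is reported as cost 0 above.
  %trunc = trunc i128 %out to i32
  ret i32 %trunc
}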
diff --git a/llvm/test/Analysis/CostModel/ARM/load_store.ll b/llvm/test/Analysis/CostModel/ARM/load_store.ll
index 4c322e9..dd2eaae 100644
--- a/llvm/test/Analysis/CostModel/ARM/load_store.ll
+++ b/llvm/test/Analysis/CostModel/ARM/load_store.ll
@@ -1,171 +1,117 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-NOVEC
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-NOVEC
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-FP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-NEON
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-NEON
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=CHECK-V8-SIZE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-NOVEC
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-NOVEC
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-FP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-NEON
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @stores() {
; CHECK-NOVEC-LABEL: 'stores'
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store double undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store double undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-FP-LABEL: 'stores'
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: store double undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'stores'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: store double undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'stores'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 7 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 7 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-V8-SIZE-LABEL: 'stores'
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'stores'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store double undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
store i8 undef, ptr undef, align 4
store i16 undef, ptr undef, align 4
@@ -199,160 +145,108 @@ define void @stores() {
define void @loads() {
; CHECK-NOVEC-LABEL: 'loads'
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-FP-LABEL: 'loads'
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'loads'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'loads'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-V8-SIZE-LABEL: 'loads'
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'loads'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
load i8, ptr undef, align 4
load i16, ptr undef, align 4
diff --git a/llvm/test/Analysis/CostModel/ARM/logicalop.ll b/llvm/test/Analysis/CostModel/ARM/logicalop.ll
index 967426c..82eb716 100644
--- a/llvm/test/Analysis/CostModel/ARM/logicalop.ll
+++ b/llvm/test/Analysis/CostModel/ARM/logicalop.ll
@@ -1,72 +1,40 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-SIZE
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @op() {
; Logical and/or - a select's cost must be equivalent to that of the corresponding binop
-; CHECK-MVE-RECIP-LABEL: 'op'
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-NEON-RECIP-LABEL: 'op'
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-THUMB1-RECIP-LABEL: 'op'
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-RECIP-LABEL: 'op'
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'op'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-NEON-SIZE-LABEL: 'op'
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB1-SIZE-LABEL: 'op'
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'op'
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-MVE-LABEL: 'op'
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-NEON-LABEL: 'op'
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-THUMB1-LABEL: 'op'
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 1 for: ret void
+;
+; CHECK-THUMB2-LABEL: 'op'
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 1 for: ret void
;
%sand = select i1 undef, i1 undef, i1 false
%band = and i1 undef, undef
@@ -77,61 +45,33 @@ define void @op() {
}
define void @vecop() {
-; CHECK-MVE-RECIP-LABEL: 'vecop'
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-NEON-RECIP-LABEL: 'vecop'
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-THUMB1-RECIP-LABEL: 'vecop'
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-RECIP-LABEL: 'vecop'
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'vecop'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-NEON-SIZE-LABEL: 'vecop'
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB1-SIZE-LABEL: 'vecop'
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'vecop'
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-MVE-LABEL: 'vecop'
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %band = and <4 x i1> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %bor = or <4 x i1> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-NEON-LABEL: 'vecop'
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %band = and <4 x i1> undef, undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %bor = or <4 x i1> undef, undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-THUMB1-LABEL: 'vecop'
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 4 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 4 for: %band = and <4 x i1> undef, undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 4 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 4 for: %bor = or <4 x i1> undef, undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 1 for: ret void
+;
+; CHECK-THUMB2-LABEL: 'vecop'
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 4 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 4 for: %band = and <4 x i1> undef, undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 4 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 4 for: %bor = or <4 x i1> undef, undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 1 for: ret void
;
%sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> <i1 false, i1 false, i1 false, i1 false>
%band = and <4 x i1> undef, undef
diff --git a/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll
index 17d4263..07d2bd0 100644
--- a/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
; ASM lines below are for reference only; tests in that direction should go to tests/CodeGen/ARM
@@ -17,11 +18,11 @@ target triple = "armv7--linux-gnueabihf"
define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'direct'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
@@ -36,13 +37,13 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = sext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -59,13 +60,13 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = zext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -82,12 +83,12 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = mul <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = sext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = mul <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -104,12 +105,12 @@ define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = mul <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = zext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = mul <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -126,12 +127,12 @@ define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'dn3216'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = trunc <4 x i32> %r3 to <4 x i16>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
index 7de2799..3ae02cd 100644
--- a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
define i64 @test(i16 %a, i16 %b) {
; CHECK-LABEL: 'test'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
; CHECK-NO-DSP-LABEL: 'test'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
%as = sext i16 %a to i32
%bs = sext i16 %b to i32
@@ -26,20 +26,20 @@ define i64 @test(i16 %a, i16 %b) {
define i64 @withadd(i16 %a, i16 %b, i64 %c) {
; CHECK-LABEL: 'withadd'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
; CHECK-NO-DSP-LABEL: 'withadd'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
%as = sext i16 %a to i32
%bs = sext i16 %b to i32
@@ -51,24 +51,24 @@ define i64 @withadd(i16 %a, i16 %b, i64 %c) {
define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
; CHECK-LABEL: 'withloads'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %bs = sext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
; CHECK-NO-DSP-LABEL: 'withloads'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 0 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 0 for: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
%a = load i16, ptr %pa
%b = load i16, ptr %pb
@@ -82,18 +82,18 @@ define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
define i64 @different_extend_ops(i16 %a, i16 %b) {
; CHECK-LABEL: 'different_extend_ops'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
; CHECK-NO-DSP-LABEL: 'different_extend_ops'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
%as = sext i16 %a to i32
%bs = zext i16 %b to i32
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
index 521816d13..04a9520 100644
--- a/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
@@ -1,20 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+
define i64 @test(i16 %a, i16 %b) {
; CHECK-LABEL: 'test'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
; CHECK-NO-DSP-LABEL: 'test'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
%as = zext i16 %a to i32
%bs = zext i16 %b to i32
@@ -25,20 +26,20 @@ define i64 @test(i16 %a, i16 %b) {
define i64 @withadd(i16 %a, i16 %b, i64 %c) {
; CHECK-LABEL: 'withadd'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
; CHECK-NO-DSP-LABEL: 'withadd'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
%as = zext i16 %a to i32
%bs = zext i16 %b to i32
@@ -50,24 +51,24 @@ define i64 @withadd(i16 %a, i16 %b, i64 %c) {
define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
; CHECK-LABEL: 'withloads'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %as = zext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
; CHECK-NO-DSP-LABEL: 'withloads'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 0 for: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 0 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
%a = load i16, ptr %pa
%b = load i16, ptr %pb
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/mve-intrinsic-cost-kinds.ll
new file mode 100644
index 0000000..b3ad818
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/mve-intrinsic-cost-kinds.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; Test a cross-section of intrinsics for various cost kinds.
+; Other test files may check the accuracy of a particular intrinsic
+; across subtargets or types. This is just a basic correctness check using an
+; ARM target and a legal scalar type (i32/float) and/or an
+; illegal vector type (16 x i32/float).
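+;
+; With -cost-kind=all, each check line reports reciprocal throughput (RThru),
+; code size (CodeSize), latency (Lat) and size-and-latency (SizeLat) costs
+; together, collapsing to a single "Found costs of N" when all four agree.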
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>)
+
+declare float @llvm.log2.f32(float)
+declare <16 x float> @llvm.log2.v16f32(<16 x float>)
+
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+declare <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float>, <16 x float>, metadata, metadata)
+
+declare float @llvm.maximum.f32(float, float)
+declare <16 x float> @llvm.maximum.v16f32(<16 x float>, <16 x float>)
+
+declare i32 @llvm.cttz.i32(i32, i1)
+declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>, i1)
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1)
+
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
+
+declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
+
+declare i32 @llvm.ssa.copy.i32(i32)
+declare float @llvm.ssa.copy.f32(float)
+declare ptr @llvm.ssa.copy.p0(ptr)
+
+define void @smax(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
+; CHECK-LABEL: 'smax'
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+ %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+ ret void
+}
+
+define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float> %vb, <16 x float> %vc) {
+; CHECK-LABEL: 'fmuladd'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+ %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+ ret void
+}
+
+define void @log2(float %a, <16 x float> %va) {
+; CHECK-LABEL: 'log2'
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %s = call float @llvm.log2.f32(float %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:192 CodeSize:48 Lat:192 SizeLat:192 for: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call float @llvm.log2.f32(float %a)
+ %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
+ ret void
+}
+
+define void @constrained_fadd(float %a, <16 x float> %va) strictfp {
+; CHECK-LABEL: 'constrained_fadd'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+; CHECK-NEXT: Cost Model: Found costs of 48 for: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret void
+}
+
+define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
+; CHECK-LABEL: 'fmaximum'
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %s = call float @llvm.maximum.f32(float %a, float %b)
+; CHECK-NEXT: Cost Model: Found costs of RThru:208 CodeSize:64 Lat:208 SizeLat:208 for: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call float @llvm.maximum.f32(float %a, float %b)
+ %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
+ ret void
+}
+
+define void @cttz(i32 %a, <16 x i32> %va) {
+; CHECK-LABEL: 'cttz'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
+ ret void
+}
+
+define void @ctlz(i32 %a, <16 x i32> %va) {
+; CHECK-LABEL: 'ctlz'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+ %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
+ ret void
+}
+
+define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) {
+; CHECK-LABEL: 'fshl'
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
+; CHECK-NEXT: Cost Model: Found costs of RThru:120 CodeSize:92 Lat:120 SizeLat:120 for: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
+ %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
+ ret void
+}
+
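+; Note that the masked gather/scatter calls are printed in auto-upgraded
+; form: the i32 alignment operand in the input IR appears as an "align"
+; parameter attribute on the pointer vector in the checked output.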
+define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) {
+; CHECK-LABEL: 'maskedgather'
+; CHECK-NEXT: Cost Model: Found costs of 176 for: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
+ ret void
+}
+
+define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) {
+; CHECK-LABEL: 'maskedscatter'
+; CHECK-NEXT: Cost Model: Found costs of 176 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc)
+ ret void
+}
+
+define void @reduce_fmax(<16 x float> %va) {
+; CHECK-LABEL: 'reduce_fmax'
+; CHECK-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:9 for: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+ ret void
+}
+
+define void @memcpy(ptr %a, ptr %b, i32 %c) {
+; CHECK-LABEL: 'memcpy'
+; CHECK-NEXT: Cost Model: Found costs of 4 for: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
+ ret void
+}
+
+define void @ssa_copy() {
+; CHECK-LABEL: 'ssa_copy'
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %f = call float @llvm.ssa.copy.f32(float undef)
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %i = call i32 @llvm.ssa.copy.i32(i32 undef)
+ %f = call float @llvm.ssa.copy.f32(float undef)
+ %p = call ptr @llvm.ssa.copy.p0(ptr undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-target-intrinsics.ll b/llvm/test/Analysis/CostModel/ARM/mve-target-intrinsics.ll
new file mode 100644
index 0000000..d09216a
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/mve-target-intrinsics.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
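+; A sampling of ARM/MVE target intrinsics, all of which are expected to get
+; the default cost of 1 for every cost kind.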
+define void @intrinsics() {
+; CHECK-LABEL: 'intrinsics'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
+ %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
+ %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
+ %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
+ ret void
+}
+
+declare i32 @llvm.arm.ssat(i32, i32)
+declare { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr)
+declare { i32, i32 } @llvm.arm.mve.sqrshrl(i32, i32, i32, i32)
+declare { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>)
diff --git a/llvm/test/Analysis/CostModel/ARM/select.ll b/llvm/test/Analysis/CostModel/ARM/select.ll
index f626901..de429fd2 100644
--- a/llvm/test/Analysis/CostModel/ARM/select.ll
+++ b/llvm/test/Analysis/CostModel/ARM/select.ll
@@ -1,272 +1,140 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-SIZE
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @selects() {
; Scalar values
-; CHECK-MVE-RECIP-LABEL: 'selects'
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-LABEL: 'selects'
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:10 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
-; CHECK-NEON-RECIP-LABEL: 'selects'
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-LABEL: 'selects'
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 19 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 50 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 100 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
-; CHECK-THUMB1-RECIP-LABEL: 'selects'
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-THUMB1-LABEL: 'selects'
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:16 SizeLat:16 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:32 SizeLat:32 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:64 SizeLat:64 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:9 Lat:6 SizeLat:6 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:20 SizeLat:20 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 1 for: ret void
;
-; CHECK-THUMB2-RECIP-LABEL: 'selects'
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'selects'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-NEON-SIZE-LABEL: 'selects'
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB1-SIZE-LABEL: 'selects'
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'selects'
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-THUMB2-LABEL: 'selects'
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:16 SizeLat:16 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:32 SizeLat:32 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:64 SizeLat:64 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:9 Lat:6 SizeLat:6 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:20 SizeLat:20 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 1 for: ret void
;
%v0 = select i1 undef, i1 undef, i1 undef
%v1 = select i1 undef, i8 undef, i8 undef
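
The check lines above illustrate the four-way cost output that `-cost-kind=all` turns on: each instruction reports RThru (reciprocal throughput), CodeSize, Lat (latency) and SizeLat (size-and-latency), and the printer collapses the tuple to a single `Found costs of N for:` when all four values agree. A minimal standalone sketch of a test written against this format, assuming the same MVE triple used above; the function name and single CHECK line are illustrative only, not part of this patch:

; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output \
; RUN:   -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s

; Hypothetical test checking a single select instruction.
; CHECK-LABEL: 'simple_select'
; CHECK: Cost Model: Found costs of {{.*}} for: %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b
define <4 x i32> @simple_select(<4 x i1> %c, <4 x i32> %a, <4 x i32> %b) {
  %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b
  ret <4 x i32> %s
}
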
diff --git a/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll
index 465611d..5ef869e 100644
--- a/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
; ASM lines below are only for reference; tests in that direction should go to tests/CodeGen/ARM
@@ -17,11 +18,11 @@ target triple = "armv7--linux-gnueabihf"
define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'direct'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
@@ -36,13 +37,13 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = sext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -59,13 +60,13 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = zext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -82,12 +83,12 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = sext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -104,12 +105,12 @@ define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = zext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -126,12 +127,12 @@ define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'dn3216'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = trunc <4 x i32> %r3 to <4 x i16>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
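
As the NOTE at the top of each file says, these assertions are autogenerated, so after a cost-model change they are refreshed with utils/update_analyze_test_checks.py rather than edited by hand. A sketch of the invocation, assuming a build tree at build/ and the script's --opt-binary option:

# Re-runs each RUN line through opt and rewrites the autogenerated
# CHECK lines of the named test in place.
python3 llvm/utils/update_analyze_test_checks.py \
  --opt-binary=build/bin/opt \
  llvm/test/Analysis/CostModel/ARM/shuffle.ll
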
diff --git a/llvm/test/Analysis/CostModel/ARM/shuffle.ll b/llvm/test/Analysis/CostModel/ARM/shuffle.ll
index a17bbab..0fd1456 100644
--- a/llvm/test/Analysis/CostModel/ARM/shuffle.ll
+++ b/llvm/test/Analysis/CostModel/ARM/shuffle.ll
@@ -1,59 +1,59 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @broadcast() {
; CHECK-MVE-LABEL: 'broadcast'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:80 CodeSize:40 Lat:80 SizeLat:80 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:5 Lat:10 SizeLat:10 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'broadcast'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 8 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 14 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 26 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 50 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
%v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
@@ -90,58 +90,58 @@ define void @broadcast() {
;; Reverse shuffles should be lowered to vrev and possibly a vext (for quadwords, on neon)
define void @reverse() {
; CHECK-MVE-LABEL: 'reverse'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:192 CodeSize:96 Lat:192 SizeLat:192 for: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'reverse'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 10 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 18 for: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
%v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -179,40 +179,40 @@ define void @reverse() {
define void @concat() {
; CHECK-MVE-LABEL: 'concat'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'concat'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 12 for: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 12 for: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 12 for: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 12 for: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 10 for: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 10 for: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -241,54 +241,54 @@ define void @concat() {
define void @select() {
; CHECK-MVE-LABEL: 'select'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'select'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 6 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 10 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
%v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -324,40 +324,40 @@ define void @select() {
define void @vrev2() {
; CHECK-MVE-LABEL: 'vrev2'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'vrev2'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 96 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 96 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 8 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
@@ -386,26 +386,26 @@ define void @vrev2() {
define void @vrev4() {
; CHECK-MVE-LABEL: 'vrev4'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'vrev4'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 96 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 96 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
%v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
diff --git a/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll
index df26121..924a629 100644
--- a/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
; ASM lines below are only for reference; tests in that direction should go to tests/CodeGen/ARM
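; For reference, the updated checks in this file come from the new RUN line: with
; -cost-kind=all the cost printer reports all four cost kinds (RThru, CodeSize,
; Lat, SizeLat) and collapses them to a single number when they all agree. A
; sketch of regenerating the output by hand (assuming an in-tree build of opt):
;   opt < sub-cast-vect.ll -passes="print<cost-model>" -cost-kind=all -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9
; which prints lines such as:
;   Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8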
@@ -17,11 +18,11 @@ target triple = "armv7--linux-gnueabihf"
define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'direct'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
@@ -36,13 +37,13 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = sext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -59,13 +60,13 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = zext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -82,12 +83,12 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = sext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -104,12 +105,12 @@ define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = zext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -126,12 +127,12 @@ define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'dn3216'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = trunc <4 x i32> %r3 to <4 x i16>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
diff --git a/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll b/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll
deleted file mode 100644
index 2d6b318..0000000
--- a/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-LAT
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-SIZE
-
-target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-
-define void @intrinsics() {
-; CHECK-THUMB2-RECIP-LABEL: 'intrinsics'
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-THUMB2-LAT-LABEL: 'intrinsics'
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'intrinsics'
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
- %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
- %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
- %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
- ret void
-}
-
-declare i32 @llvm.arm.ssat(i32, i32)
-declare { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr)
-declare { i32, i32 } @llvm.arm.mve.sqrshrl(i32, i32, i32, i32)
-declare { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>)
diff --git a/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll b/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll
index 245e8f7..058370c 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll
@@ -23,10 +23,10 @@
%"class.llvm::Metadata.306.1758.9986.10470.10954.11438.11922.12406.12890.13374.13858.15310.15794.16278.17730.19182.21118.25958.26926.29346.29830.30314.30798.31282.31766.32250.32734.33702.36606.38058.41638" = type { i8, i8, i16, i32 }
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(ptr nocapture) #0
+declare void @llvm.lifetime.end(ptr nocapture)
; Function Attrs: nounwind ssp uwtable
-define hidden void @fun(ptr %N, i1 %arg) #1 align 2 {
+define hidden void @fun(ptr %N, i1 %arg) align 2 {
; CHECK: define
entry:
%NumOperands.i = getelementptr inbounds %"class.llvm::SDNode.310.1762.9990.10474.10958.11442.11926.12410.12894.13378.13862.15314.15798.16282.17734.19186.21122.25962.26930.29350.29834.30318.30802.31286.31770.32254.32738.33706.36610.38062.41642", ptr %N, i64 0, i32 8
@@ -47,8 +47,6 @@ for.body: ; preds = %for.body, %for.body
br i1 %exitcond193, label %for.cond.cleanup, label %for.body
}
-attributes #0 = { argmemonly nounwind }
-attributes #1 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll b/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
index 0c0fb41..891d604 100644
--- a/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
+++ b/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
@@ -4,7 +4,7 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Function Attrs: noinline nounwind uwtable
-define void @mat_mul(ptr %C, ptr %A, ptr %B, i64 %N) #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
+define void @mat_mul(ptr %C, ptr %A, ptr %B, i64 %N) !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
; CHECK-LABEL: 'mat_mul'
; CHECK-NEXT: Inst: %tmp = load float, ptr %arrayidx, align 4
; CHECK-NEXT: AccessFunction: {(4 * %N * %call),+,4}<%for.inc>
@@ -22,8 +22,8 @@ entry:
br label %entry.split
entry.split: ; preds = %entry
- %call = tail call i64 @_Z13get_global_idj(i32 0) #3
- %call1 = tail call i64 @_Z13get_global_idj(i32 1) #3
+ %call = tail call i64 @_Z13get_global_idj(i32 0)
+ %call1 = tail call i64 @_Z13get_global_idj(i32 1)
%cmp1 = icmp sgt i64 %N, 0
%mul = mul nsw i64 %call, %N
br i1 %cmp1, label %for.inc.lr.ph, label %for.end
@@ -59,15 +59,10 @@ for.end: ; preds = %for.cond.for.end_cr
}
; Function Attrs: nounwind readnone
-declare i64 @_Z13get_global_idj(i32) #1
+declare i64 @_Z13get_global_idj(i32)
; Function Attrs: nounwind readnone speculatable
-declare float @llvm.fmuladd.f32(float, float, float) #2
-
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone speculatable }
-attributes #3 = { nounwind readnone }
+declare float @llvm.fmuladd.f32(float, float, float)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll
index b498d7064..f5be89a 100644
--- a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll
@@ -30,7 +30,7 @@ target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32"
%20 = type { [768 x i32] }
%21 = type { [416 x i32] }
-define void @test(ptr %A, ptr %B, i1 %arg, i32 %n, i32 %m) #0 align 2 {
+define void @test(ptr %A, ptr %B, i1 %arg, i32 %n, i32 %m) align 2 {
; CHECK-LABEL: 'test'
; CHECK-NEXT: Src: %v1 = load i32, ptr %B, align 4 --> Dst: %v1 = load i32, ptr %B, align 4
; CHECK-NEXT: da analyze - none!
@@ -91,5 +91,3 @@ bb38:
bb40:
ret void
}
-
-attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll b/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll
index e5d5d21e..eba017a 100644
--- a/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll
@@ -52,7 +52,7 @@ for.end:
@a = global [10004 x [10004 x i32]] zeroinitializer, align 16
; Function Attrs: nounwind uwtable
-define void @coupled_miv_type_mismatch(i32 %n) #0 {
+define void @coupled_miv_type_mismatch(i32 %n) {
; CHECK-LABEL: 'coupled_miv_type_mismatch'
; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx5, align 4 --> Dst: %2 = load i32, ptr %arrayidx5, align 4
; CHECK-NEXT: da analyze - none!
@@ -101,8 +101,6 @@ for.end13: ; preds = %for.cond
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 3.7.0"}
diff --git a/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll
new file mode 100644
index 0000000..64fad37
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output "-passes=print<da>" 2>&1 | FileCheck %s
+
+; for (i = 0; i < 3; i++) {
+; a[-k * i] = 1;
+; a[-k * i + (2 * k + 1)] = 2;
+; }
+;
+; When k = -1, a dependency exists between the two stores. The accesses are:
+;
+; - a[-k * i]               : a[ 0], a[ 1], a[ 2]
+; - a[-k * i + (2 * k + 1)] : a[-1], a[ 0], a[ 1]
+;
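+; The subscripts overlap when -k * i1 == -k * i2 + (2 * k + 1), i.e. when
+; k * (i2 - i1) == 2 * k + 1, which k = -1 satisfies with i2 - i1 = 1.
+;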
+; We cannot determine the sign of `k` and `2*k + 1` at compile time, so the
+; analysis must conservatively report an output dependence between the two
+; stores.
+;
+define void @unknown_sign(ptr %a, i64 %k) {
+; CHECK-LABEL: 'unknown_sign'
+; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1
+; CHECK-NEXT: da analyze - none!
+; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT: da analyze - output [<>]!
+; CHECK-NEXT: Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT: da analyze - none!
+;
+entry:
+ %k.neg = sub nsw i64 0, %k
+ %kk = mul nsw i64 %k, 2
+ %subscript.1.init = add i64 1, %kk
+ br label %loop
+
+loop:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+ %subscript.0 = phi i64 [ 0, %entry ], [ %subscript.0.next, %loop ]
+ %subscript.1 = phi i64 [ %subscript.1.init, %entry ], [ %subscript.1.next, %loop ]
+ %idx.0 = getelementptr i8, ptr %a, i64 %subscript.0
+ %idx.1 = getelementptr i8, ptr %a, i64 %subscript.1
+ store i8 1, ptr %idx.0
+ store i8 2, ptr %idx.1
+ %i.next = add i64 %i, 1
+ %subscript.0.next = add nsw i64 %subscript.0, %k.neg
+ %subscript.1.next = add nsw i64 %subscript.1, %k.neg
+ %cond.exit = icmp eq i64 %i.next, 3
+ br i1 %cond.exit, label %exit, label %loop
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll b/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll
index c11191e..5470ef9 100644
--- a/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll
+++ b/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll
@@ -17,13 +17,13 @@ target triple = "x86_64-grtev4-linux-gnu"
%4 = type { ptr }
%5 = type { i64, [8 x i8] }
-define void @fail(ptr noalias sret(i1) %arg, ptr %arg1, ptr %arg2, ptr %arg3, i1 %arg4) local_unnamed_addr #0 {
+define void @fail(ptr noalias sret(i1) %arg, ptr %arg1, ptr %arg2, ptr %arg3, i1 %arg4) local_unnamed_addr {
; CHECK-LABEL: @fail(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[I4:%.*]] = load ptr, ptr [[ARG1:%.*]], align 8, !invariant.group [[META6:![0-9]+]]
; CHECK-NEXT: [[I5:%.*]] = getelementptr inbounds ptr, ptr [[I4]], i64 6
; CHECK-NEXT: [[I6:%.*]] = load ptr, ptr [[I5]], align 8, !invariant.load [[META6]]
-; CHECK-NEXT: [[I7:%.*]] = tail call i64 [[I6]](ptr [[ARG1]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT: [[I7:%.*]] = tail call i64 [[I6]](ptr [[ARG1]])
; CHECK-NEXT: [[I9:%.*]] = load ptr, ptr [[ARG2:%.*]], align 8
; CHECK-NEXT: store i8 0, ptr [[I9]], align 1
; CHECK-NEXT: br i1 [[ARG4:%.*]], label [[BB10:%.*]], label [[BB29:%.*]]
@@ -32,7 +32,7 @@ define void @fail(ptr noalias sret(i1) %arg, ptr %arg1, ptr %arg2, ptr %arg3, i1
; CHECK-NEXT: [[I15_PRE:%.*]] = load ptr, ptr [[I14_PHI_TRANS_INSERT]], align 8, !invariant.load [[META6]]
; CHECK-NEXT: br label [[BB12:%.*]]
; CHECK: bb12:
-; CHECK-NEXT: [[I16:%.*]] = call i64 [[I15_PRE]](ptr nonnull [[ARG1]], ptr null, i64 0) #[[ATTR1]]
+; CHECK-NEXT: [[I16:%.*]] = call i64 [[I15_PRE]](ptr nonnull [[ARG1]], ptr null, i64 0)
; CHECK-NEXT: br i1 true, label [[BB28:%.*]], label [[BB17:%.*]]
; CHECK: bb17:
; CHECK-NEXT: br i1 true, label [[BB18:%.*]], label [[BB21:%.*]]
@@ -55,7 +55,7 @@ bb:
%i4 = load ptr, ptr %arg1, align 8, !invariant.group !6
%i5 = getelementptr inbounds ptr, ptr %i4, i64 6
%i6 = load ptr, ptr %i5, align 8, !invariant.load !6
- %i7 = tail call i64 %i6(ptr %arg1) #1
+ %i7 = tail call i64 %i6(ptr %arg1)
%i9 = load ptr, ptr %arg2, align 8
store i8 0, ptr %i9, align 1
br i1 %arg4, label %bb10, label %bb29
@@ -67,7 +67,7 @@ bb12: ; preds = %bb28, %bb10
%i13 = load ptr, ptr %arg1, align 8, !invariant.group !6
%i14 = getelementptr inbounds ptr, ptr %i13, i64 22
%i15 = load ptr, ptr %i14, align 8, !invariant.load !6
- %i16 = call i64 %i15(ptr nonnull %arg1, ptr null, i64 0) #1
+ %i16 = call i64 %i15(ptr nonnull %arg1, ptr null, i64 0)
br i1 %arg4, label %bb28, label %bb17
bb17: ; preds = %bb12
@@ -110,9 +110,6 @@ bb29: ; preds = %bb28, %bb
ret void
}
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.linker.options = !{}
!llvm.module.flags = !{!0, !1, !3, !4, !5}
diff --git a/llvm/test/Analysis/MemorySSA/pr28880.ll b/llvm/test/Analysis/MemorySSA/pr28880.ll
index 98f3261..a2690b9 100644
--- a/llvm/test/Analysis/MemorySSA/pr28880.ll
+++ b/llvm/test/Analysis/MemorySSA/pr28880.ll
@@ -8,7 +8,7 @@
@global.1 = external hidden unnamed_addr global double, align 8
; Function Attrs: nounwind ssp uwtable
-define hidden fastcc void @hoge(i1 %arg) unnamed_addr #0 {
+define hidden fastcc void @hoge(i1 %arg) unnamed_addr {
bb:
br i1 %arg, label %bb1, label %bb2
@@ -45,6 +45,3 @@ bb4: ; preds = %bb3
bb6: ; preds = %bb3
unreachable
}
-
-attributes #0 = { nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
diff --git a/llvm/test/Analysis/MemorySSA/pr39197.ll b/llvm/test/Analysis/MemorySSA/pr39197.ll
index af57b3c..6be0c58 100644
--- a/llvm/test/Analysis/MemorySSA/pr39197.ll
+++ b/llvm/test/Analysis/MemorySSA/pr39197.ll
@@ -12,13 +12,13 @@ declare void @dummy()
; CHECK-LABEL: @main()
; Function Attrs: nounwind
-define dso_local void @main() #0 {
+define dso_local void @main() {
call void @func_1()
unreachable
}
; Function Attrs: nounwind
-define dso_local void @func_1() #0 {
+define dso_local void @func_1() {
%1 = alloca ptr, align 8
%2 = call signext i32 @func_2()
%3 = icmp ne i32 %2, 0
@@ -64,45 +64,45 @@ define dso_local void @func_1() #0 {
}
; Function Attrs: nounwind
-declare dso_local signext i32 @func_2() #0
+declare dso_local signext i32 @func_2()
; Function Attrs: nounwind
-define dso_local void @safe_sub_func_uint8_t_u_u() #0 {
+define dso_local void @safe_sub_func_uint8_t_u_u() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_add_func_int64_t_s_s() #0 {
+define dso_local void @safe_add_func_int64_t_s_s() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_rshift_func_int16_t_s_u() #0 {
+define dso_local void @safe_rshift_func_int16_t_s_u() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_div_func_uint8_t_u_u() #0 {
+define dso_local void @safe_div_func_uint8_t_u_u() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_mul_func_uint16_t_u_u() #0 {
+define dso_local void @safe_mul_func_uint16_t_u_u() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_mul_func_int16_t_s_s() #0 {
+define dso_local void @safe_mul_func_int16_t_s_s() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_div_func_int32_t_s_s() #0 {
+define dso_local void @safe_div_func_int32_t_s_s() {
ret void
}
; Function Attrs: nounwind
-define dso_local signext i16 @safe_sub_func_int16_t_s_s(i16 signext) #0 {
+define dso_local signext i16 @safe_sub_func_int16_t_s_s(i16 signext) {
%2 = alloca i16, align 2
store i16 %0, ptr %2, align 2, !tbaa !1
%3 = load i16, ptr %2, align 2, !tbaa !1
@@ -113,29 +113,25 @@ define dso_local signext i16 @safe_sub_func_int16_t_s_s(i16 signext) #0 {
}
; Function Attrs: nounwind
-define dso_local void @safe_add_func_uint16_t_u_u() #0 {
+define dso_local void @safe_add_func_uint16_t_u_u() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_div_func_int8_t_s_s() #0 {
+define dso_local void @safe_div_func_int8_t_s_s() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_add_func_int16_t_s_s() #0 {
+define dso_local void @safe_add_func_int16_t_s_s() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_add_func_uint8_t_u_u() #0 {
+define dso_local void @safe_add_func_uint8_t_u_u() {
ret void
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="z13" "target-features"="+transactional-execution,+vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 8.0.0 (http://llvm.org/git/clang.git 7cda4756fc9713d98fd3513b8df172700f267bad) (http://llvm.org/git/llvm.git 199c0d32e96b646bd8cf6beeaf0f99f8a434b56a)"}
diff --git a/llvm/test/Analysis/MemorySSA/pr40038.ll b/llvm/test/Analysis/MemorySSA/pr40038.ll
index efdcbe5..39ea78b 100644
--- a/llvm/test/Analysis/MemorySSA/pr40038.ll
+++ b/llvm/test/Analysis/MemorySSA/pr40038.ll
@@ -10,21 +10,21 @@ target triple = "s390x-ibm-linux"
; Function Attrs: nounwind
; CHECK-LABEL: @main
-define dso_local void @main() #0 {
+define dso_local void @main() {
bb:
call void @func_1()
unreachable
}
; Function Attrs: nounwind
-define dso_local void @func_1() #0 {
+define dso_local void @func_1() {
bb:
call void @func_2()
unreachable
}
; Function Attrs: nounwind
-define dso_local void @func_2() #0 {
+define dso_local void @func_2() {
bb:
%tmp = alloca i32, align 4
store i32 0, ptr @g_80, align 4, !tbaa !1
@@ -68,10 +68,7 @@ bb18: ; preds = %bb12, %bb1
}
; Function Attrs: cold noreturn nounwind
-declare void @llvm.trap() #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="z13" "target-features"="+transactional-execution,+vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { cold noreturn nounwind }
+declare void @llvm.trap()
!llvm.ident = !{!0}
diff --git a/llvm/test/Analysis/MemorySSA/pr43569.ll b/llvm/test/Analysis/MemorySSA/pr43569.ll
index 02d074e..c81f8d4 100644
--- a/llvm/test/Analysis/MemorySSA/pr43569.ll
+++ b/llvm/test/Analysis/MemorySSA/pr43569.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK-LABEL: @c()
; Function Attrs: nounwind uwtable
-define dso_local void @c() #0 {
+define dso_local void @c() {
entry:
call void @llvm.instrprof.increment(ptr @__profn_c, i64 68269137, i32 3, i32 0)
br label %for.cond
@@ -42,8 +42,4 @@ for.end: ; preds = %for.cond1
}
; Function Attrs: nounwind
-declare void @llvm.instrprof.increment(ptr, i64, i32, i32) #1
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
diff --git a/llvm/test/Analysis/ScalarEvolution/pr22674.ll b/llvm/test/Analysis/ScalarEvolution/pr22674.ll
index 95f96ca..b2f4ae6 100644
--- a/llvm/test/Analysis/ScalarEvolution/pr22674.ll
+++ b/llvm/test/Analysis/ScalarEvolution/pr22674.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-pc-linux-gnux32"
%"class.llvm::AttributeImpl.2.1802.3601.5914.6685.7456.8227.9255.9769.10026.18508" = type <{ ptr, %"class.llvm::FoldingSetImpl::Node.1.1801.3600.5913.6684.7455.8226.9254.9768.10025.18505", i8, [3 x i8] }>
; Function Attrs: nounwind uwtable
-define void @_ZNK4llvm11AttrBuilder13hasAttributesENS_12AttributeSetEy(i1 %arg) #0 align 2 {
+define void @_ZNK4llvm11AttrBuilder13hasAttributesENS_12AttributeSetEy(i1 %arg) align 2 {
entry:
br i1 %arg, label %cond.false, label %_ZNK4llvm12AttributeSet11getNumSlotsEv.exit
@@ -82,8 +82,6 @@ return: ; preds = %_ZNK4llvm9Attribute
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll b/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll
index 3879b2e7..d9cc3e5 100644
--- a/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll
+++ b/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll
@@ -6,7 +6,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: norecurse nounwind uwtable
-define void @ehF(i1 %arg) #0 {
+define void @ehF(i1 %arg) {
entry:
br i1 %arg, label %if.then.i, label %hup.exit
@@ -28,5 +28,3 @@ for.body.i: ; preds = %for.body.i, %for.bo
hup.exit: ; preds = %for.body.i, %if.then.i, %entry
ret void
}
-
-attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
index 67d81e7..7fb4231 100644
--- a/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
+++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
@@ -13,7 +13,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
%classD = type { ptr }
; Function Attrs: ssp uwtable
-define ptr @test(ptr %this, ptr %p1) #0 align 2 {
+define ptr @test(ptr %this, ptr %p1) align 2 {
entry:
; CHECK-LABEL: @test
; CHECK: load ptr, ptr %p1, align 8, !tbaa
@@ -25,10 +25,7 @@ entry:
unreachable
}
-declare void @callee(ptr, ptr) #1
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @callee(ptr, ptr)
!llvm.ident = !{!0}
diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
index 942fdf5..f9a2988 100644
--- a/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
+++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
@@ -9,7 +9,7 @@
%struct.StructC = type { i16, %struct.StructB, i32 }
%struct.StructD = type { i16, %struct.StructB, i32, i8 }
-define i32 @_Z1gPjP7StructAy(ptr %s, ptr %A, i64 %count) #0 {
+define i32 @_Z1gPjP7StructAy(ptr %s, ptr %A, i64 %count) {
entry:
; Access to ptr and &(A->f32).
; CHECK: Function
@@ -35,7 +35,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g2PjP7StructAy(ptr %s, ptr %A, i64 %count) #0 {
+define i32 @_Z2g2PjP7StructAy(ptr %s, ptr %A, i64 %count) {
entry:
; Access to ptr and &(A->f16).
; CHECK: Function
@@ -60,7 +60,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g3P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g3P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
entry:
; Access to &(A->f32) and &(B->a.f32).
; CHECK: Function
@@ -89,7 +89,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g4P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g4P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
entry:
; Access to &(A->f32) and &(B->a.f16).
; CHECK: Function
@@ -117,7 +117,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g5P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g5P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
entry:
; Access to &(A->f32) and &(B->f32).
; CHECK: Function
@@ -145,7 +145,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g6P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g6P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
entry:
; Access to &(A->f32) and &(B->a.f32_2).
; CHECK: Function
@@ -174,7 +174,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g7P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) #0 {
+define i32 @_Z2g7P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) {
entry:
; Access to &(A->f32) and &(S->f32).
; CHECK: Function
@@ -202,7 +202,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g8P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) #0 {
+define i32 @_Z2g8P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) {
entry:
; Access to &(A->f32) and &(S->f16).
; CHECK: Function
@@ -229,7 +229,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g9P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) #0 {
+define i32 @_Z2g9P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) {
entry:
; Access to &(S->f32) and &(S2->f32).
; CHECK: Function
@@ -257,7 +257,7 @@ entry:
ret i32 %3
}
-define i32 @_Z3g10P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) #0 {
+define i32 @_Z3g10P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) {
entry:
; Access to &(S->f32) and &(S2->f16).
; CHECK: Function
@@ -284,7 +284,7 @@ entry:
ret i32 %3
}
-define i32 @_Z3g11P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) #0 {
+define i32 @_Z3g11P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) {
entry:
; Access to &(C->b.a.f32) and &(D->b.a.f32).
; CHECK: Function
@@ -318,7 +318,7 @@ entry:
ret i32 %3
}
-define i32 @_Z3g12P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) #0 {
+define i32 @_Z3g12P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) {
entry:
; Access to &(b1->a.f32) and &(b2->a.f32).
; CHECK: Function
@@ -357,8 +357,6 @@ entry:
ret i32 %5
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!0 = !{!1, !1, i64 0}
!1 = !{!"any pointer", !2}
!2 = !{!"omnipotent char", !3}
diff --git a/llvm/test/Assembler/atomic.ll b/llvm/test/Assembler/atomic.ll
index 39f33f9f..6609edc 100644
--- a/llvm/test/Assembler/atomic.ll
+++ b/llvm/test/Assembler/atomic.ll
@@ -52,6 +52,25 @@ define void @f(ptr %x) {
; CHECK: atomicrmw volatile usub_sat ptr %x, i32 10 syncscope("agent") monotonic
atomicrmw volatile usub_sat ptr %x, i32 10 syncscope("agent") monotonic
+  ; CHECK: load atomic <1 x i32>, ptr %x unordered, align 4
+  load atomic <1 x i32>, ptr %x unordered, align 4
+  ; CHECK: store atomic <1 x i32> splat (i32 3), ptr %x release, align 4
+  store atomic <1 x i32> <i32 3>, ptr %x release, align 4
+  ; CHECK: load atomic <2 x i32>, ptr %x unordered, align 4
+  load atomic <2 x i32>, ptr %x unordered, align 4
+  ; CHECK: store atomic <2 x i32> <i32 3, i32 4>, ptr %x release, align 4
+  store atomic <2 x i32> <i32 3, i32 4>, ptr %x release, align 4
+
+  ; CHECK: load atomic <2 x ptr>, ptr %x unordered, align 4
+  load atomic <2 x ptr>, ptr %x unordered, align 4
+  ; CHECK: store atomic <2 x ptr> zeroinitializer, ptr %x release, align 4
+  store atomic <2 x ptr> zeroinitializer, ptr %x release, align 4
+
+  ; CHECK: load atomic <2 x float>, ptr %x unordered, align 4
+  load atomic <2 x float>, ptr %x unordered, align 4
+  ; CHECK: store atomic <2 x float> <float 3.000000e+00, float 4.000000e+00>, ptr %x release, align 4
+  store atomic <2 x float> <float 3.0, float 4.0>, ptr %x release, align 4
+
; CHECK: fence syncscope("singlethread") release
fence syncscope("singlethread") release
; CHECK: fence seq_cst
diff --git a/llvm/test/Assembler/metadata-annotations.ll b/llvm/test/Assembler/metadata-annotations.ll
new file mode 100644
index 0000000..4fd4713
--- /dev/null
+++ b/llvm/test/Assembler/metadata-annotations.ll
@@ -0,0 +1,9 @@
+; RUN: llvm-as < %s | llvm-dis --materialize-metadata --show-annotations | FileCheck %s
+
+; CHECK: ; Materializable
+; CHECK-NEXT: define dso_local i32 @test() {}
+define dso_local i32 @test() {
+entry:
+ ret i32 0
+}
+
diff --git a/llvm/test/Bitcode/DILocation-implicit-code.ll b/llvm/test/Bitcode/DILocation-implicit-code.ll
index 159cb8a..1090bff 100644
--- a/llvm/test/Bitcode/DILocation-implicit-code.ll
+++ b/llvm/test/Bitcode/DILocation-implicit-code.ll
@@ -13,7 +13,7 @@ $_ZN1A3fooEi = comdat any
@_ZTIi = external dso_local constant i8*
; Function Attrs: noinline optnone uwtable
-define dso_local void @_Z5test1v() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg !7 {
+define dso_local void @_Z5test1v() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg !7 {
entry:
%retval = alloca %struct.A, align 1
%a = alloca %struct.A, align 1
@@ -40,13 +40,13 @@ lpad: ; preds = %entry
catch.dispatch: ; preds = %lpad
%sel = load i32, i32* %ehselector.slot, align 4, !dbg !13
- %3 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) #4, !dbg !13
+ %3 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)), !dbg !13
%matches = icmp eq i32 %sel, %3, !dbg !13
br i1 %matches, label %catch, label %eh.resume, !dbg !13
catch: ; preds = %catch.dispatch
%exn = load i8*, i8** %exn.slot, align 8, !dbg !13
- %4 = call i8* @__cxa_begin_catch(i8* %exn) #4, !dbg !13
+ %4 = call i8* @__cxa_begin_catch(i8* %exn), !dbg !13
%5 = bitcast i8* %4 to i32*, !dbg !13
%6 = load i32, i32* %5, align 4, !dbg !13
store i32 %6, i32* %e, align 4, !dbg !13
@@ -55,7 +55,7 @@ catch: ; preds = %catch.dispatch
to label %invoke.cont2 unwind label %lpad1, !dbg !15
invoke.cont2: ; preds = %catch
- call void @__cxa_end_catch() #4, !dbg !16
+ call void @__cxa_end_catch(), !dbg !16
br label %return
lpad1: ; preds = %catch
@@ -65,7 +65,7 @@ lpad1: ; preds = %catch
store i8* %9, i8** %exn.slot, align 8, !dbg !12
%10 = extractvalue { i8*, i32 } %8, 1, !dbg !12
store i32 %10, i32* %ehselector.slot, align 4, !dbg !12
- call void @__cxa_end_catch() #4, !dbg !16
+ call void @__cxa_end_catch(), !dbg !16
br label %eh.resume, !dbg !16
try.cont: ; No predecessors!
@@ -84,7 +84,7 @@ eh.resume: ; preds = %lpad1, %catch.dispa
}
; Function Attrs: noinline nounwind optnone uwtable
-define linkonce_odr dso_local void @_ZN1AC2Ev(%struct.A* %this) unnamed_addr #1 comdat align 2 !dbg !17 {
+define linkonce_odr dso_local void @_ZN1AC2Ev(%struct.A* %this) unnamed_addr comdat align 2 !dbg !17 {
entry:
%this.addr = alloca %struct.A*, align 8
store %struct.A* %this, %struct.A** %this.addr, align 8
@@ -93,7 +93,7 @@ entry:
}
; Function Attrs: noinline optnone uwtable
-define linkonce_odr dso_local void @_ZN1A3fooEi(%struct.A* %this, i32 %i) #0 comdat align 2 !dbg !19 {
+define linkonce_odr dso_local void @_ZN1A3fooEi(%struct.A* %this, i32 %i) comdat align 2 !dbg !19 {
entry:
%retval = alloca %struct.A, align 1
%this.addr = alloca %struct.A*, align 8
@@ -106,10 +106,10 @@ entry:
br i1 %cmp, label %if.then, label %if.end, !dbg !20
if.then: ; preds = %entry
- %exception = call i8* @__cxa_allocate_exception(i64 4) #4, !dbg !22
+ %exception = call i8* @__cxa_allocate_exception(i64 4), !dbg !22
%1 = bitcast i8* %exception to i32*, !dbg !22
store i32 1, i32* %1, align 16, !dbg !22
- call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null) #5, !dbg !22
+ call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null), !dbg !22
unreachable, !dbg !22
if.end: ; preds = %entry
@@ -119,17 +119,17 @@ if.end: ; preds = %entry
declare dso_local i32 @__gxx_personality_v0(...)
; Function Attrs: nounwind readnone
-declare i32 @llvm.eh.typeid.for(i8*) #2
+declare i32 @llvm.eh.typeid.for(i8*)
declare dso_local i8* @__cxa_begin_catch(i8*)
declare dso_local void @__cxa_end_catch()
; Function Attrs: noreturn nounwind
-declare void @llvm.trap() #3
+declare void @llvm.trap()
; Function Attrs: noinline optnone uwtable
-define dso_local void @_Z5test2v() #0 !dbg !24 {
+define dso_local void @_Z5test2v() !dbg !24 {
entry:
%a = alloca %struct.A, align 1
%b = alloca %struct.A, align 1
@@ -143,13 +143,6 @@ declare dso_local i8* @__cxa_allocate_exception(i64)
declare dso_local void @__cxa_throw(i8*, i8*, i8*)
-attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { noreturn nounwind }
-attributes #4 = { nounwind }
-attributes #5 = { noreturn }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/Bitcode/drop-debug-info.3.5.ll b/llvm/test/Bitcode/drop-debug-info.3.5.ll
index 35d3958..16102a0 100644
--- a/llvm/test/Bitcode/drop-debug-info.3.5.ll
+++ b/llvm/test/Bitcode/drop-debug-info.3.5.ll
@@ -12,15 +12,13 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"
; Function Attrs: nounwind ssp uwtable
-define i32 @main() #0 {
+define i32 @main() {
entry:
%retval = alloca i32, align 4
store i32 0, i32* %retval
ret i32 0, !dbg !12
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10}
!llvm.ident = !{!11}
diff --git a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll b/llvm/test/Bitcode/thinlto-deadstrip-flag.ll
deleted file mode 100644
index 00c0131..0000000
--- a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; REQUIRES: x86-registered-target
-; RUN: opt -module-summary %s -o %t.o
-
-; Ensure dead stripping performed flag is set on distributed index
-; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
-; RUN: -r %t.o,glob,plx
-; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=WITHDEAD
-; WITHDEAD: <FLAGS op0=97/>
-
-; Ensure dead stripping performed flag is not set on distributed index
-; when option used to disable dead stripping computation.
-; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
-; RUN: -r %t.o,glob,plx -compute-dead=false
-; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD
-; NODEAD: <FLAGS op0=96/>
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@glob = global i32 0
diff --git a/llvm/test/Bitcode/thinlto-index-flags.ll b/llvm/test/Bitcode/thinlto-index-flags.ll
new file mode 100644
index 0000000..e957ce6
--- /dev/null
+++ b/llvm/test/Bitcode/thinlto-index-flags.ll
@@ -0,0 +1,39 @@
+; REQUIRES: x86-registered-target
+; RUN: opt -module-summary %s -o %t.o
+
+;; By default, the indexing step should perform dead stripping, attribute
+;; propagation, DSO local propagation, and internalization/promotion, and
+;; set the corresponding index flags.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN: -r %t.o,glob,plx
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=ALL
+;; The flag value should be 0x461 aka 1121:
+;; 0x1: Dead stripping
+;; 0x20: Attribute propagation
+;; 0x40: DSO local propagation
+;; 0x400: Internalization/promotion
+; ALL: <FLAGS op0=1121/>
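+;; (0x1 | 0x20 | 0x40 | 0x400 = 0x461 = 1121.)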
+
+;; Ensure the dead stripping performed flag is not set on the distributed
+;; index when the option to disable dead stripping computation is used.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN: -r %t.o,glob,plx -compute-dead=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD
+;; Flag should be 0x460 aka 1120.
+; NODEAD: <FLAGS op0=1120/>
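+;; (0x461 with the dead stripping bit cleared: 0x461 & ~0x1 = 0x460 = 1120.)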
+
+;; Disabling attribute propagation should disable that as well as DSO local
+;; propagation.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN: -r %t.o,glob,plx -propagate-attrs=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NOPROP
+;; Flag should be 0x401 aka 1025.
+; NOPROP: <FLAGS op0=1025/>
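+;; (0x461 with both propagation bits cleared: 0x461 & ~(0x20 | 0x40) = 0x401 = 1025.)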
+
+;; Note there isn't currently a way to disable internalization+promotion, which
+;; are performed together.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@glob = global i32 0
diff --git a/llvm/test/Bitcode/upgrade-tbaa.ll b/llvm/test/Bitcode/upgrade-tbaa.ll
index 7a70d99..893ba69 100644
--- a/llvm/test/Bitcode/upgrade-tbaa.ll
+++ b/llvm/test/Bitcode/upgrade-tbaa.ll
@@ -2,7 +2,7 @@
; RUN: verify-uselistorder < %s
; Function Attrs: nounwind
-define void @_Z4testPiPf(i32* nocapture %pI, float* nocapture %pF) #0 {
+define void @_Z4testPiPf(i32* nocapture %pI, float* nocapture %pF) {
entry:
store i32 0, i32* %pI, align 4, !tbaa !{!"int", !0}
; CHECK: store i32 0, ptr %pI, align 4, !tbaa [[TAG_INT:!.*]]
@@ -11,8 +11,6 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!0 = !{!"omnipotent char", !1}
!1 = !{!"Simple C/C++ TBAA"}
!2 = !{!"float", !0}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index 0f75887..c68ae92 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1734,8 +1734,7 @@ define float @test_different_call_conv_target(float %x) {
define <2 x i32> @test_shufflevector_s32_v2s32(i32 %arg) {
; CHECK-LABEL: name: test_shufflevector_s32_v2s32
; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $w0
-; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], shufflemask(0, 0)
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ARG]](s32), [[ARG]](s32)
; CHECK: $d0 = COPY [[VEC]](<2 x s32>)
%vec = insertelement <1 x i32> undef, i32 %arg, i32 0
%res = shufflevector <1 x i32> %vec, <1 x i32> undef, <2 x i32> zeroinitializer
@@ -1745,7 +1744,8 @@ define <2 x i32> @test_shufflevector_s32_v2s32(i32 %arg) {
define i32 @test_shufflevector_v2s32_s32(<2 x i32> %arg) {
; CHECK-LABEL: name: test_shufflevector_v2s32_s32
; CHECK: [[ARG:%[0-9]+]]:_(<2 x s32>) = COPY $d0
-; CHECK: [[RES:%[0-9]+]]:_(s32) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], shufflemask(1)
+; CHECK: [[IDX:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; CHECK: [[RES:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[ARG]](<2 x s32>), [[IDX]](s64)
; CHECK: $w0 = COPY [[RES]](s32)
%vec = shufflevector <2 x i32> %arg, <2 x i32> undef, <1 x i32> <i32 1>
%res = extractelement <1 x i32> %vec, i32 0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
index 2e70252..fdd0ebb 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
@@ -119,33 +119,6 @@ body: |
...
---
-name: shuffle_1elt_mask
-alignment: 4
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $d0, $d1
-
- ; CHECK-LABEL: name: shuffle_1elt_mask
- ; CHECK: liveins: $d0, $d1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $d1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
- ; CHECK-NEXT: $d0 = COPY [[COPY2]](s64)
- ; CHECK-NEXT: $d1 = COPY [[COPY3]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit $d0, implicit $d1
- %0:_(s64) = COPY $d0
- %1:_(s64) = COPY $d1
- %3:_(s64) = G_SHUFFLE_VECTOR %0:_(s64), %1:_, shufflemask(0)
- %4:_(s64) = G_SHUFFLE_VECTOR %0:_(s64), %1:_, shufflemask(1)
- $d0 = COPY %3(s64)
- $d1 = COPY %4(s64)
- RET_ReallyLR implicit $d0, implicit $d1
-
-...
----
name: oversize_shuffle_v4i64
alignment: 4
tracksRegLiveness: true
@@ -641,7 +614,8 @@ body: |
%0:_(<8 x s1>) = G_TRUNC %2:_(<8 x s8>)
%3:_(<8 x s8>) = COPY $d1
%1:_(<8 x s1>) = G_TRUNC %3:_(<8 x s8>)
- %4:_(s1) = G_SHUFFLE_VECTOR %0:_(<8 x s1>), %1:_, shufflemask(12)
+ %7:_(s64) = G_CONSTANT i64 4
+ %4:_(s1) = G_EXTRACT_VECTOR_ELT %1:_(<8 x s1>), %7(s64)
%5:_(s8) = G_ZEXT %4:_(s1)
%6:_(s32) = G_ANYEXT %5:_(s8)
$w0 = COPY %6:_(s32)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir
index 9261d7a..914dfa0 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir
@@ -80,22 +80,3 @@ body: |
%2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(-1,0,1,-1)
RET_ReallyLR implicit %2
...
-
----
-name: shuffle_vector_scalar
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_scalar
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY]](s64), [[COPY]](s64), [[COPY]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<4 x s64>)
- %0:_(s64) = COPY $x0
- %1:_(s64) = COPY $x1
- %2:_(<4 x s64>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0, 0, 0, 0)
- RET_ReallyLR implicit %2
-...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir
index 9bf7993..7a314ba 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir
@@ -20,23 +20,3 @@ body: |
%2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %1(<2 x s32>), shufflemask(0, 2, 1, 3)
RET_ReallyLR implicit %2
...
-
----
-name: shuffle_vector_undef_rhs_scalar
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0
-
- ; CHECK-LABEL: name: shuffle_vector_undef_rhs_scalar
- ; CHECK: liveins: $x0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[DEF]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<2 x s64>)
- %0:_(s64) = COPY $x0
- %1:_(s64) = G_IMPLICIT_DEF
- %2:_(<2 x s64>) = G_SHUFFLE_VECTOR %0(s64), %1(s64), shufflemask(0, 1)
- RET_ReallyLR implicit %2
-...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir
index 2c9ae5b..9013410 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir
@@ -386,7 +386,7 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<4 x p0>)
%0:_(p0) = COPY $x0
%1:_(p0) = COPY $x1
- %6:_(<4 x p0>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0,1,0,1)
+ %6:_(<4 x p0>) = G_BUILD_VECTOR %0, %1, %0, %1
RET_ReallyLR implicit %6
...
@@ -408,104 +408,6 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<2 x p0>)
%0:_(p0) = COPY $x0
%1:_(p0) = COPY $x1
- %6:_(<2 x p0>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(1,0)
- RET_ReallyLR implicit %6
-...
-
-# Check that we properly use undef values when shuffle_vector
-# on scalars gets lowered to build_vector.
----
-name: shuffle_vector_on_scalars_to_build_vector_with_undef
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_on_scalars_to_build_vector_with_undef
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[DEF]](s64), [[DEF]](s64), [[COPY1]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<4 x s64>)
- %0:_(s64) = COPY $x0
- %1:_(s64) = COPY $x1
- %6:_(<4 x s64>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0,-1,-1,1)
- RET_ReallyLR implicit %6
-...
-
-# Check that shuffle_vector on scalars gets combined into a plain
-# copy when the resulting type is a scalar as well and the sizes
-# are compatible.
----
-name: shuffle_vector_on_scalars_to_copy_ptr
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0
-
- ; CHECK-LABEL: name: shuffle_vector_on_scalars_to_copy_ptr
- ; CHECK: liveins: $x0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
- ; CHECK-NEXT: RET_ReallyLR implicit [[COPY]](p0)
- %0:_(p0) = COPY $x0
- %6:_(p0) = G_SHUFFLE_VECTOR %0, %0, shufflemask(0)
- RET_ReallyLR implicit %6
-...
----
-name: shuffle_vector_to_copy_lhs
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_to_copy_lhs
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s32>), [[C]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[EVEC]](s32)
- %0:_(<2 x s32>) = COPY $x0
- %1:_(<2 x s32>) = COPY $x1
- %6:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(1)
- RET_ReallyLR implicit %6
-...
----
-name: shuffle_vector_to_copy_rhs
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_to_copy_rhs
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x1
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s32>), [[C]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[EVEC]](s32)
- %0:_(<2 x s32>) = COPY $x0
- %1:_(<2 x s32>) = COPY $x1
- %6:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(2)
- RET_ReallyLR implicit %6
-...
----
-name: shuffle_vector_to_copy_undef
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_to_copy_undef
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: RET_ReallyLR implicit [[DEF]](s32)
- %0:_(<2 x s32>) = COPY $x0
- %1:_(<2 x s32>) = COPY $x1
- %6:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(-1)
+ %6:_(<2 x p0>) = G_BUILD_VECTOR %1, %0
RET_ReallyLR implicit %6
...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir
index b252884..46dbc15 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir
@@ -96,7 +96,7 @@ body: |
; CHECK-NEXT: [[SITOFP:%[0-9]+]]:fpr(s32) = G_SITOFP [[COPY1]](s32)
; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[SELECT:%[0-9]+]]:fpr(s32) = G_SELECT [[COPY2]](s32), [[COPY3]], [[SITOFP]]
- ; CHECK-NEXT: [[FPTOSI:%[0-9]+]]:gpr(s32) = G_FPTOSI [[SELECT]](s32)
+ ; CHECK-NEXT: [[FPTOSI:%[0-9]+]]:fpr(s32) = G_FPTOSI [[SELECT]](s32)
%0:_(s32) = COPY $w0
%2:_(s32) = COPY $w1
%3:_(s32) = COPY $w2
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 0933e67..b54f262 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -749,12 +749,429 @@ for.body: ; preds = %for.body.preheader1
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
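+
+; Reduction of the form sum += (i64)zext(A[i]) * (i64)sext(B), vectorized
+; with a 16-wide main loop, a 4-wide vectorized epilogue and a scalar tail.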
+define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none) %A, i8 noundef %B, i32 noundef %n) {
+; CHECK-SD-LABEL: red_mla_dup_ext_u8_s8_s64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: cbz w2, .LBB6_3
+; CHECK-SD-NEXT: // %bb.1: // %iter.check
+; CHECK-SD-NEXT: str x25, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NEXT: .cfi_offset w20, -16
+; CHECK-SD-NEXT: .cfi_offset w21, -24
+; CHECK-SD-NEXT: .cfi_offset w22, -32
+; CHECK-SD-NEXT: .cfi_offset w23, -40
+; CHECK-SD-NEXT: .cfi_offset w24, -48
+; CHECK-SD-NEXT: .cfi_offset w25, -64
+; CHECK-SD-NEXT: sxtb x9, w1
+; CHECK-SD-NEXT: cmp w2, #3
+; CHECK-SD-NEXT: mov w10, w2
+; CHECK-SD-NEXT: b.hi .LBB6_4
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x8, xzr
+; CHECK-SD-NEXT: b .LBB6_13
+; CHECK-SD-NEXT: .LBB6_3:
+; CHECK-SD-NEXT: mov x0, xzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB6_4: // %vector.main.loop.iter.check
+; CHECK-SD-NEXT: dup v0.2d, x9
+; CHECK-SD-NEXT: cmp w2, #16
+; CHECK-SD-NEXT: b.hs .LBB6_6
+; CHECK-SD-NEXT: // %bb.5:
+; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x8, xzr
+; CHECK-SD-NEXT: b .LBB6_10
+; CHECK-SD-NEXT: .LBB6_6: // %vector.ph
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: and x12, x10, #0xc
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT: and x11, x10, #0xfffffff0
+; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x15, x0
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT: and x16, x10, #0xfffffff0
+; CHECK-SD-NEXT: movi v6.2d, #0000000000000000
+; CHECK-SD-NEXT: fmov x13, d0
+; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: .LBB6_7: // %vector.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr q17, [x15], #16
+; CHECK-SD-NEXT: subs x16, x16, #16
+; CHECK-SD-NEXT: ushll v18.8h, v17.8b, #0
+; CHECK-SD-NEXT: ushll2 v19.8h, v17.16b, #0
+; CHECK-SD-NEXT: ushll v17.4s, v18.4h, #0
+; CHECK-SD-NEXT: ushll2 v20.4s, v19.8h, #0
+; CHECK-SD-NEXT: ushll2 v18.4s, v18.8h, #0
+; CHECK-SD-NEXT: ushll v19.4s, v19.4h, #0
+; CHECK-SD-NEXT: ushll v21.2d, v17.2s, #0
+; CHECK-SD-NEXT: ushll2 v22.2d, v20.4s, #0
+; CHECK-SD-NEXT: ushll2 v17.2d, v17.4s, #0
+; CHECK-SD-NEXT: ushll v23.2d, v18.2s, #0
+; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0
+; CHECK-SD-NEXT: ushll2 v18.2d, v18.4s, #0
+; CHECK-SD-NEXT: fmov x17, d21
+; CHECK-SD-NEXT: mov x2, v21.d[1]
+; CHECK-SD-NEXT: ushll v21.2d, v19.2s, #0
+; CHECK-SD-NEXT: ushll2 v19.2d, v19.4s, #0
+; CHECK-SD-NEXT: fmov x18, d22
+; CHECK-SD-NEXT: fmov x1, d17
+; CHECK-SD-NEXT: fmov x3, d23
+; CHECK-SD-NEXT: fmov x21, d20
+; CHECK-SD-NEXT: fmov x22, d18
+; CHECK-SD-NEXT: fmov x19, d21
+; CHECK-SD-NEXT: mul x17, x13, x17
+; CHECK-SD-NEXT: mov x4, v22.d[1]
+; CHECK-SD-NEXT: fmov x24, d19
+; CHECK-SD-NEXT: mov x5, v23.d[1]
+; CHECK-SD-NEXT: mov x6, v21.d[1]
+; CHECK-SD-NEXT: mov x7, v20.d[1]
+; CHECK-SD-NEXT: mov x20, v18.d[1]
+; CHECK-SD-NEXT: mov x23, v19.d[1]
+; CHECK-SD-NEXT: mov x25, v17.d[1]
+; CHECK-SD-NEXT: mul x18, x14, x18
+; CHECK-SD-NEXT: mul x1, x13, x1
+; CHECK-SD-NEXT: fmov d17, x17
+; CHECK-SD-NEXT: mul x3, x13, x3
+; CHECK-SD-NEXT: fmov d18, x18
+; CHECK-SD-NEXT: mul x19, x13, x19
+; CHECK-SD-NEXT: fmov d19, x1
+; CHECK-SD-NEXT: mul x21, x13, x21
+; CHECK-SD-NEXT: fmov d20, x3
+; CHECK-SD-NEXT: mul x22, x13, x22
+; CHECK-SD-NEXT: fmov d21, x19
+; CHECK-SD-NEXT: mul x24, x13, x24
+; CHECK-SD-NEXT: fmov d24, x21
+; CHECK-SD-NEXT: mul x2, x8, x2
+; CHECK-SD-NEXT: fmov d22, x22
+; CHECK-SD-NEXT: mul x4, x8, x4
+; CHECK-SD-NEXT: fmov d23, x24
+; CHECK-SD-NEXT: mul x5, x8, x5
+; CHECK-SD-NEXT: mov v17.d[1], x2
+; CHECK-SD-NEXT: mul x6, x8, x6
+; CHECK-SD-NEXT: mov v18.d[1], x4
+; CHECK-SD-NEXT: mul x7, x8, x7
+; CHECK-SD-NEXT: mov v20.d[1], x5
+; CHECK-SD-NEXT: add v1.2d, v17.2d, v1.2d
+; CHECK-SD-NEXT: mul x20, x8, x20
+; CHECK-SD-NEXT: mov v21.d[1], x6
+; CHECK-SD-NEXT: add v6.2d, v18.2d, v6.2d
+; CHECK-SD-NEXT: mul x23, x8, x23
+; CHECK-SD-NEXT: mov v24.d[1], x7
+; CHECK-SD-NEXT: add v4.2d, v20.2d, v4.2d
+; CHECK-SD-NEXT: mul x17, x8, x25
+; CHECK-SD-NEXT: mov v22.d[1], x20
+; CHECK-SD-NEXT: add v7.2d, v21.2d, v7.2d
+; CHECK-SD-NEXT: mov v23.d[1], x23
+; CHECK-SD-NEXT: add v16.2d, v24.2d, v16.2d
+; CHECK-SD-NEXT: mov v19.d[1], x17
+; CHECK-SD-NEXT: add v3.2d, v22.2d, v3.2d
+; CHECK-SD-NEXT: add v5.2d, v23.2d, v5.2d
+; CHECK-SD-NEXT: add v2.2d, v19.2d, v2.2d
+; CHECK-SD-NEXT: b.ne .LBB6_7
+; CHECK-SD-NEXT: // %bb.8: // %middle.block
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v7.2d
+; CHECK-SD-NEXT: add v4.2d, v4.2d, v16.2d
+; CHECK-SD-NEXT: cmp x11, x10
+; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d
+; CHECK-SD-NEXT: add v3.2d, v3.2d, v6.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d
+; CHECK-SD-NEXT: add v2.2d, v2.2d, v3.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT: addp d1, v1.2d
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: b.eq .LBB6_15
+; CHECK-SD-NEXT: // %bb.9: // %vec.epilog.iter.check
+; CHECK-SD-NEXT: cbz x12, .LBB6_13
+; CHECK-SD-NEXT: .LBB6_10: // %vec.epilog.ph
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x13, x11
+; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff
+; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: and x11, x10, #0xfffffffc
+; CHECK-SD-NEXT: fmov x15, d0
+; CHECK-SD-NEXT: sub x12, x13, x11
+; CHECK-SD-NEXT: add x13, x0, x13
+; CHECK-SD-NEXT: mov v1.d[0], x8
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: .LBB6_11: // %vec.epilog.vector.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr s0, [x13], #4
+; CHECK-SD-NEXT: adds x12, x12, #4
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v4.2d, v0.2s, #0
+; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: fmov x16, d4
+; CHECK-SD-NEXT: fmov x18, d0
+; CHECK-SD-NEXT: mov x17, v4.d[1]
+; CHECK-SD-NEXT: mov x1, v0.d[1]
+; CHECK-SD-NEXT: mul x16, x14, x16
+; CHECK-SD-NEXT: mul x18, x15, x18
+; CHECK-SD-NEXT: mul x17, x8, x17
+; CHECK-SD-NEXT: fmov d0, x16
+; CHECK-SD-NEXT: mul x1, x8, x1
+; CHECK-SD-NEXT: fmov d4, x18
+; CHECK-SD-NEXT: mov v0.d[1], x17
+; CHECK-SD-NEXT: mov v4.d[1], x1
+; CHECK-SD-NEXT: add v1.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: add v2.2d, v4.2d, v2.2d
+; CHECK-SD-NEXT: b.ne .LBB6_11
+; CHECK-SD-NEXT: // %bb.12: // %vec.epilog.middle.block
+; CHECK-SD-NEXT: add v0.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT: cmp x11, x10
+; CHECK-SD-NEXT: addp d0, v0.2d
+; CHECK-SD-NEXT: fmov x8, d0
+; CHECK-SD-NEXT: b.eq .LBB6_15
+; CHECK-SD-NEXT: .LBB6_13: // %for.body.preheader
+; CHECK-SD-NEXT: sub x10, x10, x11
+; CHECK-SD-NEXT: add x11, x0, x11
+; CHECK-SD-NEXT: .LBB6_14: // %for.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldrb w12, [x11], #1
+; CHECK-SD-NEXT: subs x10, x10, #1
+; CHECK-SD-NEXT: smaddl x8, w12, w9, x8
+; CHECK-SD-NEXT: b.ne .LBB6_14
+; CHECK-SD-NEXT: .LBB6_15:
+; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldr x25, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT: cbz w2, .LBB6_7
+; CHECK-GI-NEXT: // %bb.1: // %iter.check
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: sxtb x9, w1
+; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: cmp w2, #4
+; CHECK-GI-NEXT: mov w10, w2
+; CHECK-GI-NEXT: b.lo .LBB6_12
+; CHECK-GI-NEXT: // %bb.2: // %vector.main.loop.iter.check
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: dup v1.2d, x9
+; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: cmp w2, #16
+; CHECK-GI-NEXT: b.lo .LBB6_9
+; CHECK-GI-NEXT: // %bb.3: // %vector.ph
+; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
+; CHECK-GI-NEXT: xtn v2.2s, v1.2d
+; CHECK-GI-NEXT: and x8, x10, #0xc
+; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
+; CHECK-GI-NEXT: and x11, x10, #0xfffffff0
+; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v6.2d, #0000000000000000
+; CHECK-GI-NEXT: mov x12, x0
+; CHECK-GI-NEXT: movi v7.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
+; CHECK-GI-NEXT: and x13, x10, #0xfffffff0
+; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
+; CHECK-GI-NEXT: .LBB6_4: // %vector.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr q18, [x12], #16
+; CHECK-GI-NEXT: subs x13, x13, #16
+; CHECK-GI-NEXT: ushll v19.8h, v18.8b, #0
+; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0
+; CHECK-GI-NEXT: ushll v20.4s, v19.4h, #0
+; CHECK-GI-NEXT: ushll2 v19.4s, v19.8h, #0
+; CHECK-GI-NEXT: ushll v21.4s, v18.4h, #0
+; CHECK-GI-NEXT: ushll2 v18.4s, v18.8h, #0
+; CHECK-GI-NEXT: mov d22, v20.d[1]
+; CHECK-GI-NEXT: mov d23, v19.d[1]
+; CHECK-GI-NEXT: mov d24, v21.d[1]
+; CHECK-GI-NEXT: mov d25, v18.d[1]
+; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v20.2s
+; CHECK-GI-NEXT: smlal v4.2d, v2.2s, v19.2s
+; CHECK-GI-NEXT: smlal v6.2d, v2.2s, v21.2s
+; CHECK-GI-NEXT: smlal v16.2d, v2.2s, v18.2s
+; CHECK-GI-NEXT: smlal v3.2d, v2.2s, v22.2s
+; CHECK-GI-NEXT: smlal v5.2d, v2.2s, v23.2s
+; CHECK-GI-NEXT: smlal v7.2d, v2.2s, v24.2s
+; CHECK-GI-NEXT: smlal v17.2d, v2.2s, v25.2s
+; CHECK-GI-NEXT: b.ne .LBB6_4
+; CHECK-GI-NEXT: // %bb.5: // %middle.block
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT: add v2.2d, v4.2d, v5.2d
+; CHECK-GI-NEXT: cmp x11, x10
+; CHECK-GI-NEXT: add v3.2d, v6.2d, v7.2d
+; CHECK-GI-NEXT: add v4.2d, v16.2d, v17.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: add v2.2d, v3.2d, v4.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: b.ne .LBB6_8
+; CHECK-GI-NEXT: // %bb.6:
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB6_7:
+; CHECK-GI-NEXT: mov x8, xzr
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB6_8: // %vec.epilog.iter.check
+; CHECK-GI-NEXT: cbz x8, .LBB6_12
+; CHECK-GI-NEXT: .LBB6_9: // %vec.epilog.ph
+; CHECK-GI-NEXT: mov v0.d[1], xzr
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: mov x12, x11
+; CHECK-GI-NEXT: xtn v1.2s, v1.2d
+; CHECK-GI-NEXT: and x11, x10, #0xfffffffc
+; CHECK-GI-NEXT: sub x8, x12, x11
+; CHECK-GI-NEXT: add x12, x0, x12
+; CHECK-GI-NEXT: .LBB6_10: // %vec.epilog.vector.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr w13, [x12], #4
+; CHECK-GI-NEXT: adds x8, x8, #4
+; CHECK-GI-NEXT: fmov s3, w13
+; CHECK-GI-NEXT: uxtb w13, w13
+; CHECK-GI-NEXT: mov b4, v3.b[2]
+; CHECK-GI-NEXT: mov b5, v3.b[1]
+; CHECK-GI-NEXT: mov b6, v3.b[3]
+; CHECK-GI-NEXT: fmov s3, w13
+; CHECK-GI-NEXT: fmov w14, s4
+; CHECK-GI-NEXT: fmov w15, s5
+; CHECK-GI-NEXT: fmov w16, s6
+; CHECK-GI-NEXT: uxtb w14, w14
+; CHECK-GI-NEXT: uxtb w15, w15
+; CHECK-GI-NEXT: uxtb w16, w16
+; CHECK-GI-NEXT: fmov s4, w14
+; CHECK-GI-NEXT: mov v3.s[1], w15
+; CHECK-GI-NEXT: mov v4.s[1], w16
+; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v3.2s
+; CHECK-GI-NEXT: smlal v2.2d, v1.2s, v4.2s
+; CHECK-GI-NEXT: b.ne .LBB6_10
+; CHECK-GI-NEXT: // %bb.11: // %vec.epilog.middle.block
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: cmp x11, x10
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: b.eq .LBB6_14
+; CHECK-GI-NEXT: .LBB6_12: // %for.body.preheader
+; CHECK-GI-NEXT: sub x10, x10, x11
+; CHECK-GI-NEXT: add x11, x0, x11
+; CHECK-GI-NEXT: .LBB6_13: // %for.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldrb w8, [x11], #1
+; CHECK-GI-NEXT: fmov x12, d0
+; CHECK-GI-NEXT: subs x10, x10, #1
+; CHECK-GI-NEXT: madd x8, x8, x9, x12
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: b.ne .LBB6_13
+; CHECK-GI-NEXT: .LBB6_14: // %for.cond.cleanup
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+entry:
+ %cmp5.not = icmp eq i32 %n, 0
+ br i1 %cmp5.not, label %for.cond.cleanup, label %iter.check
+
+iter.check: ; preds = %entry
+ %conv1 = sext i8 %B to i64
+ %wide.trip.count = zext i32 %n to i64
+ %min.iters.check = icmp ult i32 %n, 4
+ br i1 %min.iters.check, label %for.body.preheader, label %vector.main.loop.iter.check
+
+vector.main.loop.iter.check: ; preds = %iter.check
+ %min.iters.check9 = icmp ult i32 %n, 16
+ br i1 %min.iters.check9, label %vec.epilog.ph, label %vector.ph
+
+vector.ph: ; preds = %vector.main.loop.iter.check
+ %n.mod.vf = and i64 %wide.trip.count, 12
+ %n.vec = and i64 %wide.trip.count, 4294967280
+ %broadcast.splatinsert = insertelement <16 x i64> poison, i64 %conv1, i64 0
+ %broadcast.splat = shufflevector <16 x i64> %broadcast.splatinsert, <16 x i64> poison, <16 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <16 x i64> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
+ %0 = getelementptr inbounds nuw i8, ptr %A, i64 %index
+ %wide.load = load <16 x i8>, ptr %0, align 1
+ %1 = zext <16 x i8> %wide.load to <16 x i64>
+ %2 = mul nsw <16 x i64> %broadcast.splat, %1
+ %3 = add <16 x i64> %2, %vec.phi
+ %index.next = add nuw i64 %index, 16
+ %4 = icmp eq i64 %index.next, %n.vec
+ br i1 %4, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %5 = tail call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %3)
+ %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+ br i1 %cmp.n, label %for.cond.cleanup, label %vec.epilog.iter.check
+
+vec.epilog.iter.check: ; preds = %middle.block
+ %min.epilog.iters.check = icmp eq i64 %n.mod.vf, 0
+ br i1 %min.epilog.iters.check, label %for.body.preheader, label %vec.epilog.ph
+
+vec.epilog.ph: ; preds = %vector.main.loop.iter.check, %vec.epilog.iter.check
+ %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+ %bc.merge.rdx = phi i64 [ %5, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+ %n.vec11 = and i64 %wide.trip.count, 4294967292
+ %6 = insertelement <4 x i64> <i64 poison, i64 0, i64 0, i64 0>, i64 %bc.merge.rdx, i64 0
+ %broadcast.splatinsert12 = insertelement <4 x i64> poison, i64 %conv1, i64 0
+ %broadcast.splat13 = shufflevector <4 x i64> %broadcast.splatinsert12, <4 x i64> poison, <4 x i32> zeroinitializer
+ br label %vec.epilog.vector.body
+
+vec.epilog.vector.body: ; preds = %vec.epilog.vector.body, %vec.epilog.ph
+ %index14 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next17, %vec.epilog.vector.body ]
+ %vec.phi15 = phi <4 x i64> [ %6, %vec.epilog.ph ], [ %10, %vec.epilog.vector.body ]
+ %7 = getelementptr inbounds nuw i8, ptr %A, i64 %index14
+ %wide.load16 = load <4 x i8>, ptr %7, align 1
+ %8 = zext <4 x i8> %wide.load16 to <4 x i64>
+ %9 = mul nsw <4 x i64> %broadcast.splat13, %8
+ %10 = add <4 x i64> %9, %vec.phi15
+ %index.next17 = add nuw i64 %index14, 4
+ %11 = icmp eq i64 %index.next17, %n.vec11
+ br i1 %11, label %vec.epilog.middle.block, label %vec.epilog.vector.body
+
+vec.epilog.middle.block: ; preds = %vec.epilog.vector.body
+ %12 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %10)
+ %cmp.n18 = icmp eq i64 %n.vec11, %wide.trip.count
+ br i1 %cmp.n18, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %iter.check, %vec.epilog.iter.check, %vec.epilog.middle.block
+ %indvars.iv.ph = phi i64 [ 0, %iter.check ], [ %n.vec, %vec.epilog.iter.check ], [ %n.vec11, %vec.epilog.middle.block ]
+ %s.06.ph = phi i64 [ 0, %iter.check ], [ %5, %vec.epilog.iter.check ], [ %12, %vec.epilog.middle.block ]
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %vec.epilog.middle.block, %entry
+ %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %middle.block ], [ %12, %vec.epilog.middle.block ], [ %add, %for.body ]
+ ret i64 %s.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
+ %s.06 = phi i64 [ %add, %for.body ], [ %s.06.ph, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %A, i64 %indvars.iv
+ %13 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %13 to i64
+ %mul = mul nsw i64 %conv, %conv1
+ %add = add nsw i64 %mul, %s.06
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-LABEL: sink_v2z64_1:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: .LBB6_1: // %loop
+; CHECK-SD-NEXT: .LBB7_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -762,7 +1179,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-NEXT: umull v1.2d, v1.2s, v0.s[1]
; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-SD-NEXT: str d1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB6_1
+; CHECK-SD-NEXT: b.ne .LBB7_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -772,7 +1189,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: .LBB6_1: // %loop
+; CHECK-GI-NEXT: .LBB7_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -780,7 +1197,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: umull v1.2d, v1.2s, v0.2s
; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-GI-NEXT: str d1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB6_1
+; CHECK-GI-NEXT: b.ne .LBB7_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -813,7 +1230,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: .LBB7_1: // %loop
+; CHECK-SD-NEXT: .LBB8_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -823,7 +1240,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #15
; CHECK-SD-NEXT: shrn2 v2.4s, v1.2d, #15
; CHECK-SD-NEXT: str q2, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB7_1
+; CHECK-SD-NEXT: b.ne .LBB8_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -833,7 +1250,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: .LBB7_1: // %loop
+; CHECK-GI-NEXT: .LBB8_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -844,7 +1261,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-GI-NEXT: shrn2 v1.4s, v2.2d, #15
; CHECK-GI-NEXT: str q1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB7_1
+; CHECK-GI-NEXT: b.ne .LBB8_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -877,7 +1294,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.8b, v0.b[0]
; CHECK-SD-NEXT: mov x8, xzr
-; CHECK-SD-NEXT: .LBB8_1: // %loop
+; CHECK-SD-NEXT: .LBB9_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -886,7 +1303,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-SD-NEXT: xtn v1.8b, v1.8h
; CHECK-SD-NEXT: str d1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB8_1
+; CHECK-SD-NEXT: b.ne .LBB9_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -896,7 +1313,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.8h, v0.h[0]
; CHECK-GI-NEXT: xtn v0.8b, v0.8h
-; CHECK-GI-NEXT: .LBB8_1: // %loop
+; CHECK-GI-NEXT: .LBB9_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -905,7 +1322,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-GI-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-NEXT: str d1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB8_1
+; CHECK-GI-NEXT: b.ne .LBB9_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -938,7 +1355,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.16b, v0.b[10]
; CHECK-SD-NEXT: mov x8, xzr
-; CHECK-SD-NEXT: .LBB9_1: // %loop
+; CHECK-SD-NEXT: .LBB10_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -949,7 +1366,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: str q1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB9_1
+; CHECK-SD-NEXT: b.ne .LBB10_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -959,7 +1376,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.8h, v0.h[2]
; CHECK-GI-NEXT: xtn v0.8b, v0.8h
-; CHECK-GI-NEXT: .LBB9_1: // %loop
+; CHECK-GI-NEXT: .LBB10_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -971,7 +1388,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: str q1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB9_1
+; CHECK-GI-NEXT: b.ne .LBB10_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -1005,7 +1422,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
-; CHECK-SD-NEXT: .LBB10_1: // %vector.body
+; CHECK-SD-NEXT: .LBB11_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
@@ -1015,7 +1432,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q2, [x9]
-; CHECK-SD-NEXT: b.ne .LBB10_1
+; CHECK-SD-NEXT: b.ne .LBB11_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1026,7 +1443,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: .LBB10_1: // %vector.body
+; CHECK-GI-NEXT: .LBB11_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
@@ -1036,7 +1453,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-GI-NEXT: stp q1, q2, [x9]
-; CHECK-GI-NEXT: b.ne .LBB10_1
+; CHECK-GI-NEXT: b.ne .LBB11_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1089,7 +1506,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-SD-NEXT: dup v0.8h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
-; CHECK-SD-NEXT: .LBB11_1: // %vector.body
+; CHECK-SD-NEXT: .LBB12_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
@@ -1103,7 +1520,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
-; CHECK-SD-NEXT: b.ne .LBB11_1
+; CHECK-SD-NEXT: b.ne .LBB12_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1114,7 +1531,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: .LBB11_1: // %vector.body
+; CHECK-GI-NEXT: .LBB12_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
@@ -1130,7 +1547,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-GI-NEXT: umull v4.4s, v0.4h, v4.4h
; CHECK-GI-NEXT: stp q1, q3, [x9]
; CHECK-GI-NEXT: stp q2, q4, [x9, #32]!
-; CHECK-GI-NEXT: b.ne .LBB11_1
+; CHECK-GI-NEXT: b.ne .LBB12_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1184,7 +1601,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: .LBB12_1: // %vector.body
+; CHECK-SD-NEXT: .LBB13_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
@@ -1196,7 +1613,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0]
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q2, [x9]
-; CHECK-SD-NEXT: b.ne .LBB12_1
+; CHECK-SD-NEXT: b.ne .LBB13_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1206,7 +1623,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
-; CHECK-GI-NEXT: .LBB12_1: // %vector.body
+; CHECK-GI-NEXT: .LBB13_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
@@ -1218,7 +1635,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q1, q2, [x9]
-; CHECK-GI-NEXT: b.ne .LBB12_1
+; CHECK-GI-NEXT: b.ne .LBB13_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1272,7 +1689,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: .LBB13_1: // %vector.body
+; CHECK-SD-NEXT: .LBB14_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
@@ -1290,7 +1707,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
-; CHECK-SD-NEXT: b.ne .LBB13_1
+; CHECK-SD-NEXT: b.ne .LBB14_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1300,7 +1717,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
-; CHECK-GI-NEXT: .LBB13_1: // %vector.body
+; CHECK-GI-NEXT: .LBB14_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
@@ -1318,7 +1735,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q3, q1, [x9]
; CHECK-GI-NEXT: stp q4, q2, [x9, #32]!
-; CHECK-GI-NEXT: b.ne .LBB13_1
+; CHECK-GI-NEXT: b.ne .LBB14_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1369,9 +1786,9 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-SD-LABEL: cmplx_mul_combined_re_im:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: lsr x9, x0, #16
-; CHECK-SD-NEXT: adrp x8, .LCPI14_0
+; CHECK-SD-NEXT: adrp x8, .LCPI15_0
; CHECK-SD-NEXT: dup v4.8h, w0
-; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI15_0]
; CHECK-SD-NEXT: dup v2.8h, w9
; CHECK-SD-NEXT: sqneg v1.8h, v2.8h
; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
@@ -1386,12 +1803,12 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-GI-LABEL: cmplx_mul_combined_re_im:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: lsr w9, w0, #16
-; CHECK-GI-NEXT: adrp x8, .LCPI14_0
+; CHECK-GI-NEXT: adrp x8, .LCPI15_0
; CHECK-GI-NEXT: rev32 v4.8h, v0.8h
; CHECK-GI-NEXT: dup v1.8h, w9
; CHECK-GI-NEXT: fmov s3, w9
; CHECK-GI-NEXT: sqneg v2.8h, v1.8h
-; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
; CHECK-GI-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: dup v3.8h, w0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 6e5c666..0cd885e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: ldrh w8, [x0]
-; CHECK-NEON-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT: ldrh w8, [x0, #2]
+; CHECK-NEON-NEXT: ldr h0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
-; CHECK-NEON-NEXT: fmov d0, x8
-; CHECK-NEON-NEXT: mov v0.d[1], x9
+; CHECK-NEON-NEXT: mov v0.d[1], x8
; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: ldrh w8, [x0]
-; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT: ldrh w8, [x0, #2]
+; CHECK-SVE-NEXT: ldr h0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
-; CHECK-SVE-NEXT: fmov d0, x8
-; CHECK-SVE-NEXT: mov v0.d[1], x9
+; CHECK-SVE-NEXT: mov v0.d[1], x8
; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll
new file mode 100644
index 0000000..a729772
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll
@@ -0,0 +1,1943 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-NOFPRCVT
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fprcvt,+fullfp16 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -global-isel -global-isel-abort=2 -mattr=+fprcvt,+fullfp16 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI: warning: Instruction selection used fallback path for fptosi_i32_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i64_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i64_f32_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i32_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i64_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i32_f32_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i32_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i64_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i64_f32_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i32_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i64_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i32_f32_simd
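+; With -global-isel-abort=2, GlobalISel falls back to SelectionDAG for the
+; constrained (strictfp) conversions and prints the warnings checked above.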
+
+;
+; FPTOI
+;
+
+define float @test_fptosi_f16_i32_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f16_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f16_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %r = fptosi half %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptosi_f16_i64_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f16_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f16_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %r = fptosi half %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define float @test_fptosi_f64_i32_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f64_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f64_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %r = fptosi double %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptosi_f32_i64_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f32_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f32_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %r = fptosi float %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define double @test_fptosi_f64_i64_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f64_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f64_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %r = fptosi double %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+
+define float @test_fptosi_f32_i32_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f32_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f32_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %r = fptosi float %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define float @test_fptoui_f16_i32_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f16_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f16_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %r = fptoui half %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptoui_f16_i64_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f16_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f16_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %r = fptoui half %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define float @test_fptoui_f64_i32_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f64_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f64_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %r = fptoui double %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptoui_f32_i64_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f32_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f32_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %r = fptoui float %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define double @test_fptoui_f64_i64_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f64_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f64_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %r = fptoui double %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+
+define float @test_fptoui_f32_i32_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f32_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f32_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %r = fptoui float %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+
+;
+; FPTOI strictfp
+;
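+; The constrained intrinsics below request strict FP exception semantics
+; (fpexcept.strict); they should still select the same single fcvtzs/fcvtzu
+; conversions as the default-environment tests above.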
+
+define float @fptosi_i32_f16_simd(half %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i32_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i32_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptosi.i32.f16(half %x, metadata !"fpexcept.strict") strictfp
+ %sum = bitcast i32 %val to float
+ ret float %sum
+}
+
+define double @fptosi_i64_f16_simd(half %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i64_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i64_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptosi.i64.f16(half %x, metadata !"fpexcept.strict") strictfp
+ %sum = bitcast i64 %val to double
+ ret double %sum
+}
+
+define double @fptosi_i64_f32_simd(float %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i64_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i64_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptosi.i64.f32(float %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptosi_i32_f64_simd(double %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i32_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i32_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+define double @fptosi_i64_f64_simd(double %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i64_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i64_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptosi.i64.f64(double %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptosi_i32_f32_simd(float %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i32_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i32_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptosi.i32.f32(float %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+
+
+define float @fptoui_i32_f16_simd(half %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i32_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i32_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptoui.i32.f16(half %x, metadata !"fpexcept.strict") strictfp
+ %sum = bitcast i32 %val to float
+ ret float %sum
+}
+
+define double @fptoui_i64_f16_simd(half %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i64_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i64_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptoui.i64.f16(half %x, metadata !"fpexcept.strict") strictfp
+ %sum = bitcast i64 %val to double
+ ret double %sum
+}
+
+define double @fptoui_i64_f32_simd(float %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i64_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i64_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptoui.i64.f32(float %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptoui_i32_f64_simd(double %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i32_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i32_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptoui.i32.f64(double %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+define double @fptoui_i64_f64_simd(double %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i64_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i64_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptoui.i64.f64(double %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptoui_i32_f32_simd(float %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i32_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i32_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptoui.i32.f32(float %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+;
+; FPTOI rounding
+;
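+; fptosi/fptoui of llvm.round/floor/ceil/trunc should fold with the rounding
+; step into a single fcvta*/fcvtm*/fcvtp*/fcvtz* conversion.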
+
+
+define double @fcvtas_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtas_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtas_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtas_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtau_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtau_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtau_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtau_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtms_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtms_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtms_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtms_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+
+define double @fcvtmu_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtmu_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtmu_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtmu_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtps_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtps_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtps_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtps_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtpu_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtpu_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtpu_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtpu_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtzs_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzs_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzu_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzu_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+;
+; FPTOI saturating
+;
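+; The fcvtz[su] instructions saturate on overflow by definition, so a single
+; SIMD-scalar convert implements llvm.fpto[su]i.sat whenever the result stays
+; in a floating-point register; for the cross-size cases the NOFPRCVT lines
+; show the GPR round-trip used when FPRCVT is unavailable.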
+
+define float @fcvtzs_sh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzs_ds_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ds_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ds_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzs_ss_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ss_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ss_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzu_ds_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ds_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ds_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzu_ss_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ss_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ss_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+;
+; FPTOI saturating with rounding
+;
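+; A rounding intrinsic feeding a saturating conversion folds into the matching
+; rounded convert: llvm.round -> fcvta[su], llvm.floor -> fcvtm[su],
+; llvm.ceil -> fcvtp[su], llvm.trunc -> fcvtz[su].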
+
+define float @fcvtas_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtas_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtas_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtas_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtas_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtas_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtau_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtau_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtau_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtau_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtau_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtau_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtms_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtms_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtms_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtms_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtms_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtms_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtmu_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtmu_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtmu_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtmu_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtmu_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtmu_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtps_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtps_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtps_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtps_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtps_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtps_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtpu_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtpu_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtpu_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtpu_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtpu_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtpu_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzs_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzs_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzu_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzu_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index e18a5f6..d8f3708 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -980,12 +980,18 @@ define <1 x double> @test_bitcasti64tov1f64(i64 %in) {
}
define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
-; CHECK-LABEL: test_bitcastv8i8tov1f64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v0.8b, v0.8b
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_bitcastv8i8tov1f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v0.8b, v0.8b
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_bitcastv8i8tov1f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: neg v0.8b, v0.8b
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%sub.i = sub <8 x i8> zeroinitializer, %a
%1 = bitcast <8 x i8> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -993,12 +999,18 @@ define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
}
define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
-; CHECK-LABEL: test_bitcastv4i16tov1f64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v0.4h, v0.4h
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_bitcastv4i16tov1f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v0.4h, v0.4h
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_bitcastv4i16tov1f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: neg v0.4h, v0.4h
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%sub.i = sub <4 x i16> zeroinitializer, %a
%1 = bitcast <4 x i16> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -1006,12 +1018,18 @@ define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
}
define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 {
-; CHECK-LABEL: test_bitcastv2i32tov1f64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v0.2s, v0.2s
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_bitcastv2i32tov1f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v0.2s, v0.2s
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_bitcastv2i32tov1f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: neg v0.2s, v0.2s
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%sub.i = sub <2 x i32> zeroinitializer, %a
%1 = bitcast <2 x i32> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -1031,8 +1049,7 @@ define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 {
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: neg x8, x8
; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fcvtzs x8, d0
-; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: fcvtzs d0, d0
; CHECK-GI-NEXT: ret
%sub.i = sub <1 x i64> zeroinitializer, %a
%1 = bitcast <1 x i64> %sub.i to <1 x double>
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll
index 627d31f..1e0cfa0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll
@@ -359,11 +359,16 @@ define <2 x i64> @fcvtzs_2d(<2 x double> %A) nounwind {
; FIXME: Generate "fcvtzs d0, d0"?
define <1 x i64> @fcvtzs_1d(<1 x double> %A) nounwind {
-; CHECK-LABEL: fcvtzs_1d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fcvtzs_1d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fcvtzs_1d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%tmp3 = fptosi <1 x double> %A to <1 x i64>
ret <1 x i64> %tmp3
}
@@ -438,11 +443,16 @@ define <2 x i64> @fcvtzu_2d(<2 x double> %A) nounwind {
; FIXME: Generate "fcvtzu d0, d0"?
define <1 x i64> @fcvtzu_1d(<1 x double> %A) nounwind {
-; CHECK-LABEL: fcvtzu_1d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzu x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fcvtzu_1d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvtzu x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fcvtzu_1d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvtzu d0, d0
+; CHECK-GI-NEXT: ret
%tmp3 = fptoui <1 x double> %A to <1 x i64>
ret <1 x i64> %tmp3
}
diff --git a/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
new file mode 100644
index 0000000..cf52934
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; Test optimization of DUP with extended narrow loads
+; This should avoid GPR->SIMD transfers by loading directly into vector registers
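+; For example, a dup of a zero-extended i8 load (registers illustrative):
+;   ldrb w8, [x0]           ldr b0, [x0]
+;   dup  v0.4s, w8    =>    dup v0.4s, v0.s[0]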
+
+define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <4 x i16> poison, i16 %ext, i32 0
+ %dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer
+ ret <4 x i16> %dup
+}
+
+define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <8 x i16> poison, i16 %ext, i32 0
+ %dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer
+ ret <8 x i16> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, #4]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 4
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, x1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 %offset
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i8_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, #8]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 4
+ %load = load i16, ptr %addr, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 %offset
+ %load = load i16, ptr %addr, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i16_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i32_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i32_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i32, ptr %p, align 1
+ %ext = zext i32 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 079ff10..670574f2 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -32,8 +32,8 @@ entry:
define <2 x i8> @loaddup_v2i8(ptr %p) {
; CHECK-LABEL: loaddup_v2i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: dup v0.2s, w8
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
; CHECK-NEXT: ret
entry:
%a = load i8, ptr %p
@@ -189,8 +189,8 @@ entry:
define <4 x i8> @loaddup_v4i8(ptr %p) {
; CHECK-SD-LABEL: loaddup_v4i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: dup v0.4h, w8
+; CHECK-SD-NEXT: ldr b0, [x0]
+; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v4i8:
@@ -444,8 +444,8 @@ entry:
define <2 x i16> @loaddup_v2i16(ptr %p) {
; CHECK-SD-LABEL: loaddup_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: dup v0.2s, w8
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: dup v0.2s, v0.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index c741129..b963acd 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -31,8 +31,7 @@ define <1 x i32> @test_signed_v1f32_v1i32(<1 x float> %f) {
;
; CHECK-GI-LABEL: test_signed_v1f32_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fcvtzs w8, s0
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fcvtzs s0, s0
; CHECK-GI-NEXT: ret
%x = call <1 x i32> @llvm.fptosi.sat.v1f32.v1i32(<1 x float> %f)
ret <1 x i32> %x
@@ -1162,18 +1161,24 @@ declare <7 x i32> @llvm.fptosi.sat.v7f16.v7i32 (<7 x half>)
declare <8 x i32> @llvm.fptosi.sat.v8f16.v8i32 (<8 x half>)
define <1 x i32> @test_signed_v1f16_v1i32(<1 x half> %f) {
-; CHECK-CVT-LABEL: test_signed_v1f16_v1i32:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvt s0, h0
-; CHECK-CVT-NEXT: fcvtzs w8, s0
-; CHECK-CVT-NEXT: fmov s0, w8
-; CHECK-CVT-NEXT: ret
+; CHECK-SD-CVT-LABEL: test_signed_v1f16_v1i32:
+; CHECK-SD-CVT: // %bb.0:
+; CHECK-SD-CVT-NEXT: fcvt s0, h0
+; CHECK-SD-CVT-NEXT: fcvtzs w8, s0
+; CHECK-SD-CVT-NEXT: fmov s0, w8
+; CHECK-SD-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_signed_v1f16_v1i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcvtzs w8, h0
; CHECK-FP16-NEXT: fmov s0, w8
; CHECK-FP16-NEXT: ret
+;
+; CHECK-GI-CVT-LABEL: test_signed_v1f16_v1i32:
+; CHECK-GI-CVT: // %bb.0:
+; CHECK-GI-CVT-NEXT: fcvt s0, h0
+; CHECK-GI-CVT-NEXT: fcvtzs s0, s0
+; CHECK-GI-CVT-NEXT: ret
%x = call <1 x i32> @llvm.fptosi.sat.v1f16.v1i32(<1 x half> %f)
ret <1 x i32> %x
}
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index efe0a1b..5a66b68 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -31,8 +31,7 @@ define <1 x i32> @test_unsigned_v1f32_v1i32(<1 x float> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v1f32_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fcvtzu w8, s0
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fcvtzu s0, s0
; CHECK-GI-NEXT: ret
%x = call <1 x i32> @llvm.fptoui.sat.v1f32.v1i32(<1 x float> %f)
ret <1 x i32> %x
@@ -993,18 +992,24 @@ declare <7 x i32> @llvm.fptoui.sat.v7f16.v7i32 (<7 x half>)
declare <8 x i32> @llvm.fptoui.sat.v8f16.v8i32 (<8 x half>)
define <1 x i32> @test_unsigned_v1f16_v1i32(<1 x half> %f) {
-; CHECK-CVT-LABEL: test_unsigned_v1f16_v1i32:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvt s0, h0
-; CHECK-CVT-NEXT: fcvtzu w8, s0
-; CHECK-CVT-NEXT: fmov s0, w8
-; CHECK-CVT-NEXT: ret
+; CHECK-SD-CVT-LABEL: test_unsigned_v1f16_v1i32:
+; CHECK-SD-CVT: // %bb.0:
+; CHECK-SD-CVT-NEXT: fcvt s0, h0
+; CHECK-SD-CVT-NEXT: fcvtzu w8, s0
+; CHECK-SD-CVT-NEXT: fmov s0, w8
+; CHECK-SD-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_unsigned_v1f16_v1i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcvtzu w8, h0
; CHECK-FP16-NEXT: fmov s0, w8
; CHECK-FP16-NEXT: ret
+;
+; CHECK-GI-CVT-LABEL: test_unsigned_v1f16_v1i32:
+; CHECK-GI-CVT: // %bb.0:
+; CHECK-GI-CVT-NEXT: fcvt s0, h0
+; CHECK-GI-CVT-NEXT: fcvtzu s0, s0
+; CHECK-GI-CVT-NEXT: ret
%x = call <1 x i32> @llvm.fptoui.sat.v1f16.v1i32(<1 x half> %f)
ret <1 x i32> %x
}
diff --git a/llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll b/llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll
new file mode 100644
index 0000000..85991fb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -O3 -mtriple=aarch64 | FileCheck %s
+
+; From #164775: this generates a pre-index load feeding a post-index store,
+; where the combine was checking the wrong instruction's uses when forming the
+; post-increment. It is quite delicate to make the compiler produce this
+; combination at exactly the point needed to reproduce the issue.
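+; The shape in question, as in the checks below (registers illustrative):
+;   ldrh w8, [x1, #4]!  // pre-index load writes back to the base register
+;   strh w8, [x1]       // the store then uses the updated base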
+
+@g_260 = dso_local global i16 0
+@g_480 = dso_local global i16 0
+
+define i32 @func_1(ptr %l_3253) {
+; CHECK-LABEL: func_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: .cfi_def_cfa_offset 128
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov w9, #2 // =0x2
+; CHECK-NEXT: mov w10, #96 // =0x60
+; CHECK-NEXT: strb wzr, [x9]
+; CHECK-NEXT: mov w9, #111 // =0x6f
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: str wzr, [x9]
+; CHECK-NEXT: mov w9, #80 // =0x50
+; CHECK-NEXT: adrp x1, .L_MergedGlobals
+; CHECK-NEXT: add x1, x1, :lo12:.L_MergedGlobals
+; CHECK-NEXT: strh wzr, [x8]
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: mov w9, #48 // =0x30
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: mov w9, #32 // =0x20
+; CHECK-NEXT: str q0, [x10]
+; CHECK-NEXT: mov w10, #64 // =0x40
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: mov w9, #16 // =0x10
+; CHECK-NEXT: str q0, [x10]
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: str q0, [x8]
+; CHECK-NEXT: adrp x8, .L_MergedGlobals
+; CHECK-NEXT: strb wzr, [x0, #8]
+; CHECK-NEXT: strb wzr, [x0, #12]
+; CHECK-NEXT: strb wzr, [x0, #16]
+; CHECK-NEXT: strb wzr, [x0, #20]
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ldrh wzr, [x8, :lo12:.L_MergedGlobals]
+; CHECK-NEXT: ldrh w8, [x1, #4]!
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: strh w8, [x1]
+; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: b use
+entry:
+ %l_32531.sroa.3 = alloca [3 x i8], align 4
+ %l_32531.sroa.4 = alloca [115 x i8], align 4
+ call void @llvm.lifetime.start.p0(ptr %l_32531.sroa.3)
+ call void @llvm.lifetime.start.p0(ptr %l_32531.sroa.4)
+ call void @llvm.memset.p0.i64(ptr null, i8 0, i64 3, i1 false)
+ call void @llvm.memset.p0.i64(ptr null, i8 0, i64 115, i1 false)
+ %0 = getelementptr inbounds i8, ptr %l_3253, i64 8
+ store i8 0, ptr %0, align 4
+ %1 = getelementptr inbounds i8, ptr %l_3253, i64 12
+ store i8 0, ptr %1, align 4
+ %2 = getelementptr inbounds i8, ptr %l_3253, i64 16
+ store i8 0, ptr %2, align 4
+ %3 = getelementptr inbounds i8, ptr %l_3253, i64 20
+ store i8 0, ptr %3, align 4
+ %4 = load volatile i16, ptr @g_260, align 4
+ %5 = load i16, ptr @g_480, align 4
+ %dec.i.i = add i16 %5, -1
+ store i16 %dec.i.i, ptr @g_480, align 4
+ %call1 = tail call i32 @use(i32 0, ptr @g_480)
+ ret i32 %call1
+}
+
+declare i32 @use(i32, ptr)
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
index 9193025..628506b 100644
--- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
@@ -84,8 +84,7 @@ entry:
define double @load_u64_from_u32_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -98,8 +97,7 @@ entry:
define double @load_u64_from_u16_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -112,8 +110,7 @@ entry:
define double @load_u64_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -126,8 +123,7 @@ entry:
define float @load_u32_from_u16_off1(ptr %n){
; CHECK-LABEL: load_u32_from_u16_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -140,8 +136,7 @@ entry:
define float @load_u32_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -154,8 +149,7 @@ entry:
define half @load_u16_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -171,8 +165,7 @@ entry:
define double @load_u64_from_u32_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -185,8 +178,7 @@ entry:
define double @load_u64_from_u16_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrh w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr h0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -199,8 +191,7 @@ entry:
define double @load_u64_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -226,7 +217,7 @@ entry:
define float @load_u32_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -239,7 +230,7 @@ entry:
define half @load_u16_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -255,8 +246,7 @@ entry:
define double @load_u64_from_u32_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -269,8 +259,7 @@ entry:
define double @load_u64_from_u16_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur h0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -283,8 +272,7 @@ entry:
define double @load_u64_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -297,8 +285,7 @@ entry:
define float @load_u32_from_u16_off255(ptr %n){
; CHECK-LABEL: load_u32_from_u16_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldur h0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -311,8 +298,7 @@ entry:
define float @load_u32_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -325,8 +311,7 @@ entry:
define half @load_u16_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -354,7 +339,7 @@ entry:
define double @load_u64_from_u16_off256(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr h0, [x0, #128]
+; CHECK-NEXT: ldr h0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -367,7 +352,7 @@ entry:
define double @load_u64_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #64]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -393,7 +378,7 @@ entry:
define float @load_u32_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -406,7 +391,7 @@ entry:
define half @load_u16_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -435,8 +420,7 @@ entry:
define double @load_u64_from_u16_offn(ptr %n){
; CHECK-LABEL: load_u64_from_u16_offn:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #8190 // =0x1ffe
-; CHECK-NEXT: ldr h0, [x0, x8]
+; CHECK-NEXT: ldr h0, [x0, #8190]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8190
@@ -503,8 +487,8 @@ entry:
define double @load_u64_from_u32_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u32_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #4, lsl #12 // =16384
-; CHECK-NEXT: ldr s0, [x8]
+; CHECK-NEXT: mov w8, #16384 // =0x4000
+; CHECK-NEXT: ldr s0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 16384
@@ -517,7 +501,8 @@ entry:
define double @load_u64_from_u16_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u16_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr h0, [x0, #4096]
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: ldr h0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8192
@@ -530,7 +515,8 @@ entry:
define double @load_u64_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1024]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -543,8 +529,8 @@ entry:
define float @load_u32_from_u16_offnp1(ptr %n){
; CHECK-LABEL: load_u32_from_u16_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192
-; CHECK-NEXT: ldr h0, [x8]
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: ldr h0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8192
@@ -557,7 +543,8 @@ entry:
define float @load_u32_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u32_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -570,7 +557,8 @@ entry:
define half @load_u16_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u16_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/pr164181.ll b/llvm/test/CodeGen/AArch64/pr164181.ll
new file mode 100644
index 0000000..4ec63ec
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pr164181.ll
@@ -0,0 +1,640 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+; This test recreates a regalloc crash reported in
+; https://github.com/llvm/llvm-project/issues/164181.
+; When rematerializing an instruction, we need to make sure to constrain the
+; newly allocated register to both the rematted def's reg class and the use's
+; reg class.
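+;
+; A rough sketch of that constraint in terms of the generic codegen APIs
+; (names and placement are illustrative assumptions, not the exact in-tree
+; fix): intersect the def's and the use's register classes when creating
+; the new vreg, e.g.
+;   const TargetRegisterClass *DefRC = MRI.getRegClass(OrigDefReg);
+;   const TargetRegisterClass *UseRC =
+;       TII.getRegClass(UseMI.getDesc(), UseOpIdx, &TRI, MF);
+;   MRI.setRegClass(NewReg, TRI.getCommonSubClass(DefRC, UseRC));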
+
+target triple = "aarch64-unknown-linux-gnu"
+
+@var_32 = external global i16
+@var_35 = external global i64
+@var_39 = external global i64
+@var_46 = external global i64
+@var_50 = external global i32
+
+define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var_5, i32 %var_6, i32 %var_7, i8 %var_10, i64 %var_11, i8 %var_14, i32 %var_15, i64 %var_16, ptr %arr_3, ptr %arr_4, ptr %arr_6, ptr %arr_7, ptr %arr_12, ptr %arr_13, ptr %arr_19, i64 %mul, i64 %conv35, i64 %idxprom138.us16, i8 %0, i8 %1, ptr %invariant.gep875.us) #0 {
+; CHECK-LABEL: f:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #240
+; CHECK-NEXT: str x30, [sp, #144] // 8-byte Folded Spill
+; CHECK-NEXT: stp x28, x27, [sp, #160] // 16-byte Folded Spill
+; CHECK-NEXT: stp x26, x25, [sp, #176] // 16-byte Folded Spill
+; CHECK-NEXT: stp x24, x23, [sp, #192] // 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #208] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #224] // 16-byte Folded Spill
+; CHECK-NEXT: str w6, [sp, #20] // 4-byte Folded Spill
+; CHECK-NEXT: str w4, [sp, #72] // 4-byte Folded Spill
+; CHECK-NEXT: str w3, [sp, #112] // 4-byte Folded Spill
+; CHECK-NEXT: str w5, [sp, #36] // 4-byte Folded Spill
+; CHECK-NEXT: tbz w5, #0, .LBB0_43
+; CHECK-NEXT: // %bb.1: // %for.body41.lr.ph
+; CHECK-NEXT: ldr x4, [sp, #312]
+; CHECK-NEXT: ldr x14, [sp, #280]
+; CHECK-NEXT: tbz w0, #0, .LBB0_42
+; CHECK-NEXT: // %bb.2: // %for.body41.us.preheader
+; CHECK-NEXT: ldrb w8, [sp, #368]
+; CHECK-NEXT: ldrb w12, [sp, #256]
+; CHECK-NEXT: ldr w26, [sp, #264]
+; CHECK-NEXT: adrp x20, :got:var_50
+; CHECK-NEXT: mov x28, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov w21, #36006 // =0x8ca6
+; CHECK-NEXT: ldr x11, [sp, #376]
+; CHECK-NEXT: ldrb w13, [sp, #360]
+; CHECK-NEXT: ldp x17, x16, [sp, #296]
+; CHECK-NEXT: mov w22, #1 // =0x1
+; CHECK-NEXT: add x27, x14, #120
+; CHECK-NEXT: ldr x18, [sp, #288]
+; CHECK-NEXT: ldr x7, [sp, #272]
+; CHECK-NEXT: ldr x5, [sp, #248]
+; CHECK-NEXT: mov x10, xzr
+; CHECK-NEXT: mov w23, wzr
+; CHECK-NEXT: mov w30, wzr
+; CHECK-NEXT: ldrb w19, [sp, #240]
+; CHECK-NEXT: mov w25, wzr
+; CHECK-NEXT: mov x24, xzr
+; CHECK-NEXT: str w8, [sp, #108] // 4-byte Folded Spill
+; CHECK-NEXT: mov x3, x26
+; CHECK-NEXT: ldp x9, x8, [sp, #344]
+; CHECK-NEXT: str w12, [sp, #92] // 4-byte Folded Spill
+; CHECK-NEXT: mov w12, #1 // =0x1
+; CHECK-NEXT: bic w12, w12, w0
+; CHECK-NEXT: str w12, [sp, #76] // 4-byte Folded Spill
+; CHECK-NEXT: mov w12, #48 // =0x30
+; CHECK-NEXT: str x9, [sp, #136] // 8-byte Folded Spill
+; CHECK-NEXT: ldp x9, x15, [sp, #328]
+; CHECK-NEXT: madd x8, x8, x12, x9
+; CHECK-NEXT: str x8, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: add x8, x26, w26, uxtw #1
+; CHECK-NEXT: ldr x20, [x20, :got_lo12:var_50]
+; CHECK-NEXT: str x26, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: str x14, [sp, #152] // 8-byte Folded Spill
+; CHECK-NEXT: lsl x6, x8, #3
+; CHECK-NEXT: add x8, x14, #120
+; CHECK-NEXT: str x4, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: str w19, [sp, #16] // 4-byte Folded Spill
+; CHECK-NEXT: str x8, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: b .LBB0_4
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_3: // in Loop: Header=BB0_4 Depth=1
+; CHECK-NEXT: ldr w19, [sp, #16] // 4-byte Folded Reload
+; CHECK-NEXT: ldr x24, [sp, #40] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x14, [sp, #152] // 8-byte Folded Reload
+; CHECK-NEXT: mov w23, #1 // =0x1
+; CHECK-NEXT: mov w30, #1 // =0x1
+; CHECK-NEXT: mov w25, w19
+; CHECK-NEXT: .LBB0_4: // %for.body41.us
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB0_6 Depth 2
+; CHECK-NEXT: // Child Loop BB0_8 Depth 3
+; CHECK-NEXT: // Child Loop BB0_10 Depth 4
+; CHECK-NEXT: // Child Loop BB0_11 Depth 5
+; CHECK-NEXT: // Child Loop BB0_28 Depth 5
+; CHECK-NEXT: // Child Loop BB0_39 Depth 5
+; CHECK-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload
+; CHECK-NEXT: mov x12, x24
+; CHECK-NEXT: str x24, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT: str w8, [x14]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: strb w19, [x14]
+; CHECK-NEXT: b .LBB0_6
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_5: // %for.cond.cleanup93.us
+; CHECK-NEXT: // in Loop: Header=BB0_6 Depth=2
+; CHECK-NEXT: ldr w9, [sp, #36] // 4-byte Folded Reload
+; CHECK-NEXT: ldr x4, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x24, x12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: mov x22, xzr
+; CHECK-NEXT: mov w25, wzr
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: tbz w9, #0, .LBB0_3
+; CHECK-NEXT: .LBB0_6: // %for.body67.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // => This Loop Header: Depth=2
+; CHECK-NEXT: // Child Loop BB0_8 Depth 3
+; CHECK-NEXT: // Child Loop BB0_10 Depth 4
+; CHECK-NEXT: // Child Loop BB0_11 Depth 5
+; CHECK-NEXT: // Child Loop BB0_28 Depth 5
+; CHECK-NEXT: // Child Loop BB0_39 Depth 5
+; CHECK-NEXT: str x12, [sp, #40] // 8-byte Folded Spill
+; CHECK-NEXT: cmn x24, #30
+; CHECK-NEXT: mov x12, #-30 // =0xffffffffffffffe2
+; CHECK-NEXT: add x19, x4, w8, sxtw #2
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: csel x12, x24, x12, lo
+; CHECK-NEXT: mov w4, w30
+; CHECK-NEXT: str x12, [sp, #56] // 8-byte Folded Spill
+; CHECK-NEXT: b .LBB0_8
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_7: // %for.cond.cleanup98.us
+; CHECK-NEXT: // in Loop: Header=BB0_8 Depth=3
+; CHECK-NEXT: ldr w4, [sp, #72] // 4-byte Folded Reload
+; CHECK-NEXT: ldr w23, [sp, #128] // 4-byte Folded Reload
+; CHECK-NEXT: mov w9, #1 // =0x1
+; CHECK-NEXT: mov x22, xzr
+; CHECK-NEXT: tbnz w0, #0, .LBB0_5
+; CHECK-NEXT: .LBB0_8: // %for.cond95.preheader.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB0_6 Depth=2
+; CHECK-NEXT: // => This Loop Header: Depth=3
+; CHECK-NEXT: // Child Loop BB0_10 Depth 4
+; CHECK-NEXT: // Child Loop BB0_11 Depth 5
+; CHECK-NEXT: // Child Loop BB0_28 Depth 5
+; CHECK-NEXT: // Child Loop BB0_39 Depth 5
+; CHECK-NEXT: ldr x8, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: mov w14, #1152 // =0x480
+; CHECK-NEXT: mov w24, #1 // =0x1
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill
+; CHECK-NEXT: mov w30, w4
+; CHECK-NEXT: madd x8, x9, x14, x8
+; CHECK-NEXT: mov w14, #1 // =0x1
+; CHECK-NEXT: str x8, [sp, #120] // 8-byte Folded Spill
+; CHECK-NEXT: add x8, x9, x9, lsl #1
+; CHECK-NEXT: lsl x26, x8, #4
+; CHECK-NEXT: sxtb w8, w23
+; CHECK-NEXT: mov w23, w25
+; CHECK-NEXT: str w8, [sp, #116] // 4-byte Folded Spill
+; CHECK-NEXT: b .LBB0_10
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_9: // %for.cond510.preheader.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldr w23, [sp, #92] // 4-byte Folded Reload
+; CHECK-NEXT: mov x22, x8
+; CHECK-NEXT: ldr x3, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x27, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: mov x28, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov x14, xzr
+; CHECK-NEXT: ldr w8, [sp, #76] // 4-byte Folded Reload
+; CHECK-NEXT: tbz w8, #31, .LBB0_7
+; CHECK-NEXT: .LBB0_10: // %for.body99.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB0_6 Depth=2
+; CHECK-NEXT: // Parent Loop BB0_8 Depth=3
+; CHECK-NEXT: // => This Loop Header: Depth=4
+; CHECK-NEXT: // Child Loop BB0_11 Depth 5
+; CHECK-NEXT: // Child Loop BB0_28 Depth 5
+; CHECK-NEXT: // Child Loop BB0_39 Depth 5
+; CHECK-NEXT: ldr w8, [sp, #116] // 4-byte Folded Reload
+; CHECK-NEXT: and w8, w8, w8, asr #31
+; CHECK-NEXT: str w8, [sp, #128] // 4-byte Folded Spill
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_11: // %for.body113.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB0_6 Depth=2
+; CHECK-NEXT: // Parent Loop BB0_8 Depth=3
+; CHECK-NEXT: // Parent Loop BB0_10 Depth=4
+; CHECK-NEXT: // => This Inner Loop Header: Depth=5
+; CHECK-NEXT: tbnz w0, #0, .LBB0_11
+; CHECK-NEXT: // %bb.12: // %for.cond131.preheader.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldr w8, [sp, #112] // 4-byte Folded Reload
+; CHECK-NEXT: mov w4, #1 // =0x1
+; CHECK-NEXT: strb w8, [x18]
+; CHECK-NEXT: ldr x8, [sp, #120] // 8-byte Folded Reload
+; CHECK-NEXT: ldrh w8, [x8]
+; CHECK-NEXT: cbnz w4, .LBB0_14
+; CHECK-NEXT: // %bb.13: // %cond.true146.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldrsb w4, [x27, x3]
+; CHECK-NEXT: b .LBB0_15
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_14: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov w4, wzr
+; CHECK-NEXT: .LBB0_15: // %cond.end154.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov w25, #18984 // =0x4a28
+; CHECK-NEXT: mul w8, w8, w25
+; CHECK-NEXT: and w8, w8, #0xfff8
+; CHECK-NEXT: lsl w8, w8, w4
+; CHECK-NEXT: cbz w8, .LBB0_17
+; CHECK-NEXT: // %bb.16: // %if.then.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill
+; CHECK-NEXT: str wzr, [x18]
+; CHECK-NEXT: .LBB0_17: // %if.end.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldr w8, [sp, #108] // 4-byte Folded Reload
+; CHECK-NEXT: mov w4, #18984 // =0x4a28
+; CHECK-NEXT: mov w25, w23
+; CHECK-NEXT: strb w8, [x18]
+; CHECK-NEXT: ldrsb w8, [x27, x3]
+; CHECK-NEXT: lsl w8, w4, w8
+; CHECK-NEXT: mov x4, #-18403 // =0xffffffffffffb81d
+; CHECK-NEXT: movk x4, #58909, lsl #16
+; CHECK-NEXT: cbz w8, .LBB0_19
+; CHECK-NEXT: // %bb.18: // %if.then.us.2
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill
+; CHECK-NEXT: strb wzr, [x18]
+; CHECK-NEXT: .LBB0_19: // %if.then.us.5
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldr w23, [sp, #132] // 4-byte Folded Reload
+; CHECK-NEXT: mov w8, #29625 // =0x73b9
+; CHECK-NEXT: movk w8, #21515, lsl #16
+; CHECK-NEXT: cmp w23, w8
+; CHECK-NEXT: csel w23, w23, w8, lt
+; CHECK-NEXT: str w23, [sp, #132] // 4-byte Folded Spill
+; CHECK-NEXT: tbz w0, #0, .LBB0_21
+; CHECK-NEXT: // %bb.20: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: b .LBB0_22
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_21: // %cond.true146.us.7
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldrsb w8, [x27, x3]
+; CHECK-NEXT: .LBB0_22: // %cond.end154.us.7
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov w23, #18984 // =0x4a28
+; CHECK-NEXT: mov w3, #149 // =0x95
+; CHECK-NEXT: lsl w8, w23, w8
+; CHECK-NEXT: cbz w8, .LBB0_24
+; CHECK-NEXT: // %bb.23: // %if.then.us.7
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldr x8, [sp, #152] // 8-byte Folded Reload
+; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill
+; CHECK-NEXT: str wzr, [x8]
+; CHECK-NEXT: .LBB0_24: // %if.end.us.7
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov x23, xzr
+; CHECK-NEXT: b .LBB0_28
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_25: // %cond.true331.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: ldrsb w4, [x10]
+; CHECK-NEXT: .LBB0_26: // %cond.end345.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: strh w4, [x18]
+; CHECK-NEXT: mul x4, x22, x28
+; CHECK-NEXT: adrp x22, :got:var_46
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: ldr x22, [x22, :got_lo12:var_46]
+; CHECK-NEXT: str x4, [x22]
+; CHECK-NEXT: mov x4, #-18403 // =0xffffffffffffb81d
+; CHECK-NEXT: movk x4, #58909, lsl #16
+; CHECK-NEXT: .LBB0_27: // %for.inc371.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: mov w22, #-18978 // =0xffffb5de
+; CHECK-NEXT: orr x23, x23, #0x1
+; CHECK-NEXT: mov x24, xzr
+; CHECK-NEXT: mul w12, w12, w22
+; CHECK-NEXT: mov x22, x5
+; CHECK-NEXT: tbz w0, #0, .LBB0_36
+; CHECK-NEXT: .LBB0_28: // %for.body194.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB0_6 Depth=2
+; CHECK-NEXT: // Parent Loop BB0_8 Depth=3
+; CHECK-NEXT: // Parent Loop BB0_10 Depth=4
+; CHECK-NEXT: // => This Inner Loop Header: Depth=5
+; CHECK-NEXT: cbnz wzr, .LBB0_30
+; CHECK-NEXT: // %bb.29: // %if.then222.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: adrp x27, :got:var_32
+; CHECK-NEXT: ldur w8, [x19, #-12]
+; CHECK-NEXT: ldr x27, [x27, :got_lo12:var_32]
+; CHECK-NEXT: strh w8, [x27]
+; CHECK-NEXT: sxtb w8, w25
+; CHECK-NEXT: bic w25, w8, w8, asr #31
+; CHECK-NEXT: b .LBB0_31
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_30: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: mov w25, wzr
+; CHECK-NEXT: .LBB0_31: // %if.end239.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: strb w3, [x16]
+; CHECK-NEXT: tst w13, #0xff
+; CHECK-NEXT: b.eq .LBB0_33
+; CHECK-NEXT: // %bb.32: // %if.then254.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: ldrh w8, [x26, x14, lsl #1]
+; CHECK-NEXT: adrp x27, :got:var_35
+; CHECK-NEXT: ldr x27, [x27, :got_lo12:var_35]
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: csel x8, xzr, x7, eq
+; CHECK-NEXT: str x8, [x27]
+; CHECK-NEXT: strh w1, [x17]
+; CHECK-NEXT: .LBB0_33: // %if.end282.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: orr x27, x24, x4
+; CHECK-NEXT: adrp x8, :got:var_39
+; CHECK-NEXT: str x27, [x18]
+; CHECK-NEXT: ldr x8, [x8, :got_lo12:var_39]
+; CHECK-NEXT: str x10, [x8]
+; CHECK-NEXT: ldrb w8, [x6, x9]
+; CHECK-NEXT: str x8, [x18]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: cbnz x2, .LBB0_27
+; CHECK-NEXT: // %bb.34: // %if.then327.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: cbz w8, .LBB0_25
+; CHECK-NEXT: // %bb.35: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: mov w4, wzr
+; CHECK-NEXT: b .LBB0_26
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_36: // %for.cond376.preheader.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov w3, #1152 // =0x480
+; CHECK-NEXT: mov x22, xzr
+; CHECK-NEXT: mov w4, wzr
+; CHECK-NEXT: mov x24, x27
+; CHECK-NEXT: lsl x23, x14, #1
+; CHECK-NEXT: mov x27, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: madd x14, x14, x3, x11
+; CHECK-NEXT: mov w28, w30
+; CHECK-NEXT: mov w3, #-7680 // =0xffffe200
+; CHECK-NEXT: b .LBB0_39
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_37: // %if.then466.us
+; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5
+; CHECK-NEXT: ldr x28, [sp, #152] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x3, [sp, #136] // 8-byte Folded Reload
+; CHECK-NEXT: sxtb w4, w4
+; CHECK-NEXT: bic w4, w4, w4, asr #31
+; CHECK-NEXT: str x3, [x28]
+; CHECK-NEXT: mov w3, #-7680 // =0xffffe200
+; CHECK-NEXT: .LBB0_38: // %for.inc505.us
+; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5
+; CHECK-NEXT: add x22, x22, #1
+; CHECK-NEXT: add x27, x27, #1
+; CHECK-NEXT: mov w28, wzr
+; CHECK-NEXT: cmp x27, #0
+; CHECK-NEXT: b.hs .LBB0_9
+; CHECK-NEXT: .LBB0_39: // %for.body380.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB0_6 Depth=2
+; CHECK-NEXT: // Parent Loop BB0_8 Depth=3
+; CHECK-NEXT: // Parent Loop BB0_10 Depth=4
+; CHECK-NEXT: // => This Inner Loop Header: Depth=5
+; CHECK-NEXT: mov w30, w28
+; CHECK-NEXT: ldrh w28, [x23]
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: strh w28, [x11]
+; CHECK-NEXT: csel w28, w21, w3, ne
+; CHECK-NEXT: str w28, [x20]
+; CHECK-NEXT: cbz x15, .LBB0_38
+; CHECK-NEXT: // %bb.40: // %if.then436.us
+; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5
+; CHECK-NEXT: ldrh w28, [x14]
+; CHECK-NEXT: cbnz w28, .LBB0_37
+; CHECK-NEXT: // %bb.41: // in Loop: Header=BB0_39 Depth=5
+; CHECK-NEXT: mov w4, wzr
+; CHECK-NEXT: b .LBB0_38
+; CHECK-NEXT: .LBB0_42: // %for.body41
+; CHECK-NEXT: strb wzr, [x4]
+; CHECK-NEXT: strb wzr, [x14]
+; CHECK-NEXT: .LBB0_43: // %for.cond563.preheader
+; CHECK-NEXT: ldp x20, x19, [sp, #224] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x22, x21, [sp, #208] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x24, x23, [sp, #192] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x26, x25, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x28, x27, [sp, #160] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #240
+; CHECK-NEXT: ret
+entry:
+ br i1 %var_5, label %for.body41.lr.ph, label %for.cond563.preheader
+
+for.body41.lr.ph: ; preds = %entry
+ %arrayidx147 = getelementptr i8, ptr %arr_3, i64 120
+ %tobool326.not = icmp eq i64 %var_2, 0
+ %not353 = xor i64 0, -1
+ %add538 = select i1 %var_0, i16 0, i16 1
+ br i1 %var_0, label %for.body41.us, label %for.body41
+
+for.body41.us: ; preds = %for.cond.cleanup93.us, %for.body41.lr.ph
+ %var_24.promoted9271009.us = phi i64 [ 0, %for.body41.lr.ph ], [ %6, %for.cond.cleanup93.us ]
+ %var_37.promoted9301008.us = phi i64 [ 1, %for.body41.lr.ph ], [ 0, %for.cond.cleanup93.us ]
+ %2 = phi i8 [ 0, %for.body41.lr.ph ], [ 1, %for.cond.cleanup93.us ]
+ %add4139751001.us = phi i16 [ 0, %for.body41.lr.ph ], [ 1, %for.cond.cleanup93.us ]
+ %3 = phi i8 [ 0, %for.body41.lr.ph ], [ %var_10, %for.cond.cleanup93.us ]
+ store i32 %var_6, ptr %arr_3, align 4
+ store i8 %var_10, ptr %arr_3, align 1
+ br label %for.body67.us
+
+for.body67.us: ; preds = %for.cond.cleanup93.us, %for.body41.us
+ %4 = phi i8 [ %3, %for.body41.us ], [ 0, %for.cond.cleanup93.us ]
+ %add413977.us = phi i16 [ %add4139751001.us, %for.body41.us ], [ %add413.us17, %for.cond.cleanup93.us ]
+ %5 = phi i8 [ %2, %for.body41.us ], [ %.sroa.speculated829.us, %for.cond.cleanup93.us ]
+ %conv64922.us = phi i32 [ 1, %for.body41.us ], [ 0, %for.cond.cleanup93.us ]
+ %6 = phi i64 [ %var_24.promoted9271009.us, %for.body41.us ], [ %.sroa.speculated832.us, %for.cond.cleanup93.us ]
+ %mul354903918.us = phi i64 [ %var_37.promoted9301008.us, %for.body41.us ], [ 0, %for.cond.cleanup93.us ]
+ %i_2.0921.us = zext i32 %var_15 to i64
+ %.sroa.speculated832.us = tail call i64 @llvm.umin.i64(i64 %var_24.promoted9271009.us, i64 -30)
+ %sext1023 = shl i64 %i_2.0921.us, 1
+ %idxprom138.us162 = ashr i64 %sext1023, 1
+ %gep889.us = getelementptr [24 x i16], ptr %arr_19, i64 %idxprom138.us16
+ %arrayidx149.us = getelementptr i8, ptr %arrayidx147, i64 %idxprom138.us162
+ %arrayidx319.us = getelementptr [24 x i8], ptr null, i64 %idxprom138.us162
+ %7 = sext i32 %conv64922.us to i64
+ %8 = getelementptr i32, ptr %arr_12, i64 %7
+ %arrayidx226.us = getelementptr i8, ptr %8, i64 -12
+ br label %for.cond95.preheader.us
+
+for.cond.cleanup93.us: ; preds = %for.cond.cleanup98.us
+ br i1 %var_5, label %for.body67.us, label %for.body41.us
+
+for.cond.cleanup98.us: ; preds = %for.cond510.preheader.us
+ br i1 %var_0, label %for.cond.cleanup93.us, label %for.cond95.preheader.us
+
+for.body99.us: ; preds = %for.cond95.preheader.us, %for.cond510.preheader.us
+ %mul287985.us = phi i16 [ 0, %for.cond95.preheader.us ], [ %mul287.us, %for.cond510.preheader.us ]
+ %9 = phi i8 [ %29, %for.cond95.preheader.us ], [ %var_14, %for.cond510.preheader.us ]
+ %add413979.us = phi i16 [ %add413978.us, %for.cond95.preheader.us ], [ %add413.us17, %for.cond510.preheader.us ]
+ %10 = phi i32 [ 0, %for.cond95.preheader.us ], [ %26, %for.cond510.preheader.us ]
+ %mul354905.us = phi i64 [ %mul354904.us, %for.cond95.preheader.us ], [ %mul354907.us, %for.cond510.preheader.us ]
+ %sub283896.us = phi i64 [ 1, %for.cond95.preheader.us ], [ %sub283.us, %for.cond510.preheader.us ]
+ %conv96880.us = phi i64 [ 1, %for.cond95.preheader.us ], [ 0, %for.cond510.preheader.us ]
+ %.sroa.speculated829.us = tail call i8 @llvm.smin.i8(i8 %30, i8 0)
+ br label %for.body113.us
+
+for.body380.us: ; preds = %for.cond376.preheader.us, %for.inc505.us
+ %indvars.iv1018 = phi i64 [ 0, %for.cond376.preheader.us ], [ %indvars.iv.next1019, %for.inc505.us ]
+ %11 = phi i8 [ 0, %for.cond376.preheader.us ], [ %13, %for.inc505.us ]
+ %add413980.us = phi i16 [ %add413979.us, %for.cond376.preheader.us ], [ 0, %for.inc505.us ]
+ %12 = load i16, ptr %arrayidx384.us, align 2
+ store i16 %12, ptr %invariant.gep875.us, align 2
+ %add413.us17 = or i16 %add413980.us, 0
+ %arrayidx416.us = getelementptr i16, ptr %arr_13, i64 %indvars.iv1018
+ %conv419.us = select i1 %var_0, i32 36006, i32 -7680
+ store i32 %conv419.us, ptr @var_50, align 4
+ %tobool435.not.us = icmp eq i64 %mul, 0
+ br i1 %tobool435.not.us, label %for.inc505.us, label %if.then436.us
+
+if.then436.us: ; preds = %for.body380.us
+ %.sroa.speculated817.us = tail call i8 @llvm.smax.i8(i8 %11, i8 0)
+ %cond464.in.us = load i16, ptr %gep876.us, align 2
+ %tobool465.not.us = icmp eq i16 %cond464.in.us, 0
+ br i1 %tobool465.not.us, label %for.inc505.us, label %if.then466.us
+
+if.then466.us: ; preds = %if.then436.us
+ store i64 %conv35, ptr %arr_3, align 8
+ br label %for.inc505.us
+
+for.inc505.us: ; preds = %if.then466.us, %if.then436.us, %for.body380.us
+ %13 = phi i8 [ %11, %for.body380.us ], [ %.sroa.speculated817.us, %if.then466.us ], [ 0, %if.then436.us ]
+ %indvars.iv.next1019 = add i64 %indvars.iv1018, 1
+ %cmp378.us = icmp ult i64 %indvars.iv1018, 0
+ br i1 %cmp378.us, label %for.body380.us, label %for.cond510.preheader.us
+
+for.body194.us: ; preds = %if.end.us.7, %for.inc371.us
+ %indvars.iv = phi i64 [ 0, %if.end.us.7 ], [ %indvars.iv.next, %for.inc371.us ]
+ %mul287986.us = phi i16 [ %mul287985.us, %if.end.us.7 ], [ %mul287.us, %for.inc371.us ]
+ %14 = phi i8 [ %9, %if.end.us.7 ], [ %16, %for.inc371.us ]
+ %mul354906.us = phi i64 [ %mul354905.us, %if.end.us.7 ], [ %var_11, %for.inc371.us ]
+ %sub283897.us = phi i64 [ %sub283896.us, %if.end.us.7 ], [ 0, %for.inc371.us ]
+ %tobool221.not.us = icmp eq i32 1, 0
+ br i1 %tobool221.not.us, label %if.end239.us, label %if.then222.us
+
+if.then222.us: ; preds = %for.body194.us
+ %15 = load i32, ptr %arrayidx226.us, align 4
+ %conv227.us = trunc i32 %15 to i16
+ store i16 %conv227.us, ptr @var_32, align 2
+ %.sroa.speculated820.us = tail call i8 @llvm.smax.i8(i8 %14, i8 0)
+ br label %if.end239.us
+
+if.end239.us: ; preds = %if.then222.us, %for.body194.us
+ %16 = phi i8 [ %.sroa.speculated820.us, %if.then222.us ], [ 0, %for.body194.us ]
+ store i8 -107, ptr %arr_7, align 1
+ %tobool253.not.us = icmp eq i8 %0, 0
+ br i1 %tobool253.not.us, label %if.end282.us, label %if.then254.us
+
+if.then254.us: ; preds = %if.end239.us
+ %17 = load i16, ptr %arrayidx259.us, align 2
+ %tobool261.not.us = icmp eq i16 %17, 0
+ %conv268.us = select i1 %tobool261.not.us, i64 0, i64 %var_16
+ store i64 %conv268.us, ptr @var_35, align 8
+ %gep867.us = getelementptr [24 x [24 x i64]], ptr null, i64 %indvars.iv
+ store i16 %var_1, ptr %arr_6, align 2
+ br label %if.end282.us
+
+if.end282.us: ; preds = %if.then254.us, %if.end239.us
+ %sub283.us = or i64 %sub283897.us, -434259939
+ store i64 %sub283.us, ptr %arr_4, align 8
+ %mul287.us = mul i16 %mul287986.us, -18978
+ store i64 0, ptr @var_39, align 8
+ %18 = load i8, ptr %arrayidx321.us, align 1
+ %conv322.us = zext i8 %18 to i64
+ store i64 %conv322.us, ptr %arr_4, align 8
+ br i1 %tobool326.not, label %if.then327.us, label %for.inc371.us
+
+if.then327.us: ; preds = %if.end282.us
+ %tobool330.not.us = icmp eq i32 0, 0
+ br i1 %tobool330.not.us, label %cond.end345.us, label %cond.true331.us
+
+cond.true331.us: ; preds = %if.then327.us
+ %19 = load i8, ptr null, align 1
+ %20 = sext i8 %19 to i16
+ br label %cond.end345.us
+
+cond.end345.us: ; preds = %cond.true331.us, %if.then327.us
+ %cond346.us = phi i16 [ %20, %cond.true331.us ], [ 0, %if.then327.us ]
+ store i16 %cond346.us, ptr %arr_4, align 2
+ %mul354.us = mul i64 %mul354906.us, %not353
+ store i64 %mul354.us, ptr @var_46, align 8
+ br label %for.inc371.us
+
+for.inc371.us: ; preds = %cond.end345.us, %if.end282.us
+ %mul354907.us = phi i64 [ 1, %if.end282.us ], [ 0, %cond.end345.us ]
+ %indvars.iv.next = or i64 %indvars.iv, 1
+ br i1 %var_0, label %for.body194.us, label %for.cond376.preheader.us
+
+cond.true146.us: ; preds = %for.cond131.preheader.us
+ %21 = load i8, ptr %arrayidx149.us, align 1
+ %conv150.us = sext i8 %21 to i32
+ br label %cond.end154.us
+
+cond.end154.us: ; preds = %for.cond131.preheader.us, %cond.true146.us
+ %cond155.us = phi i32 [ %conv150.us, %cond.true146.us ], [ 0, %for.cond131.preheader.us ]
+ %shl.us = shl i32 %div.us, %cond155.us
+ %tobool157.not.us = icmp eq i32 %shl.us, 0
+ br i1 %tobool157.not.us, label %if.end.us, label %if.then.us
+
+if.then.us: ; preds = %cond.end154.us
+ store i32 0, ptr %arr_4, align 4
+ br label %if.end.us
+
+if.end.us: ; preds = %if.then.us, %cond.end154.us
+ %22 = phi i32 [ 0, %if.then.us ], [ %10, %cond.end154.us ]
+ store i8 %1, ptr %arr_4, align 1
+ call void @llvm.assume(i1 true)
+ %23 = load i8, ptr %arrayidx149.us, align 1
+ %conv150.us.2 = sext i8 %23 to i32
+ %shl.us.2 = shl i32 18984, %conv150.us.2
+ %tobool157.not.us.2 = icmp eq i32 %shl.us.2, 0
+ br i1 %tobool157.not.us.2, label %if.then.us.5, label %if.then.us.2
+
+if.then.us.2: ; preds = %if.end.us
+ %.sroa.speculated826.us.2 = tail call i32 @llvm.smin.i32(i32 %10, i32 0)
+ store i8 0, ptr %arr_4, align 1
+ br label %if.then.us.5
+
+if.then.us.5: ; preds = %if.then.us.2, %if.end.us
+ %24 = phi i32 [ 0, %if.then.us.2 ], [ %22, %if.end.us ]
+ %.sroa.speculated826.us.5 = tail call i32 @llvm.smin.i32(i32 %24, i32 1410036665)
+ br i1 %var_0, label %cond.end154.us.7, label %cond.true146.us.7
+
+cond.true146.us.7: ; preds = %if.then.us.5
+ %25 = load i8, ptr %arrayidx149.us, align 1
+ %conv150.us.7 = sext i8 %25 to i32
+ br label %cond.end154.us.7
+
+cond.end154.us.7: ; preds = %cond.true146.us.7, %if.then.us.5
+ %cond155.us.7 = phi i32 [ %conv150.us.7, %cond.true146.us.7 ], [ 0, %if.then.us.5 ]
+ %shl.us.7 = shl i32 18984, %cond155.us.7
+ %tobool157.not.us.7 = icmp eq i32 %shl.us.7, 0
+ br i1 %tobool157.not.us.7, label %if.end.us.7, label %if.then.us.7
+
+if.then.us.7: ; preds = %cond.end154.us.7
+ store i32 0, ptr %arr_3, align 4
+ br label %if.end.us.7
+
+if.end.us.7: ; preds = %if.then.us.7, %cond.end154.us.7
+ %26 = phi i32 [ 0, %if.then.us.7 ], [ %.sroa.speculated826.us.5, %cond.end154.us.7 ]
+ %arrayidx259.us = getelementptr i16, ptr %arrayidx257.us, i64 %conv96880.us
+ br label %for.body194.us
+
+for.body113.us: ; preds = %for.body113.us, %for.body99.us
+ br i1 %var_0, label %for.body113.us, label %for.cond131.preheader.us
+
+for.cond510.preheader.us: ; preds = %for.inc505.us
+ %cmp97.us = icmp slt i16 %add538, 0
+ br i1 %cmp97.us, label %for.body99.us, label %for.cond.cleanup98.us
+
+for.cond376.preheader.us: ; preds = %for.inc371.us
+ %arrayidx384.us = getelementptr i16, ptr null, i64 %conv96880.us
+ %gep876.us = getelementptr [24 x [24 x i16]], ptr %invariant.gep875.us, i64 %conv96880.us
+ br label %for.body380.us
+
+for.cond131.preheader.us: ; preds = %for.body113.us
+ store i8 %var_3, ptr %arr_4, align 1
+ %27 = load i16, ptr %gep884.us, align 2
+ %28 = mul i16 18984, %27
+ %div.us = zext i16 %28 to i32
+ %tobool145.not.us = icmp eq i8 0, 0
+ br i1 %tobool145.not.us, label %cond.end154.us, label %cond.true146.us
+
+for.cond95.preheader.us: ; preds = %for.cond.cleanup98.us, %for.body67.us
+ %indvars.iv1021 = phi i64 [ 1, %for.cond.cleanup98.us ], [ 0, %for.body67.us ]
+ %29 = phi i8 [ %16, %for.cond.cleanup98.us ], [ %4, %for.body67.us ]
+ %add413978.us = phi i16 [ %var_4, %for.cond.cleanup98.us ], [ %add413977.us, %for.body67.us ]
+ %30 = phi i8 [ %.sroa.speculated829.us, %for.cond.cleanup98.us ], [ %5, %for.body67.us ]
+ %mul354904.us = phi i64 [ 0, %for.cond.cleanup98.us ], [ %mul354903918.us, %for.body67.us ]
+ %gep884.us = getelementptr [24 x [24 x i16]], ptr %gep889.us, i64 %indvars.iv1021
+ %arrayidx321.us = getelementptr i8, ptr %arrayidx319.us, i64 %indvars.iv1021
+ %arrayidx257.us = getelementptr [24 x i16], ptr null, i64 %indvars.iv1021
+ br label %for.body99.us
+
+for.cond563.preheader: ; preds = %for.body41, %entry
+ ret void
+
+for.body41: ; preds = %for.body41.lr.ph
+ store i8 0, ptr %arr_12, align 1
+ store i8 0, ptr %arr_3, align 1
+ br label %for.cond563.preheader
+}
+
+attributes #0 = { nounwind "frame-pointer"="non-leaf" "target-cpu"="grace" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
index 06c53d8..b17b48c 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
@@ -86,4 +86,111 @@ define i64 @sme_cntsd_mul() {
ret i64 %res
}
-declare i64 @llvm.aarch64.sme.cntsd()
+define i64 @sme_cntsb_mul_pos() {
+; CHECK-LABEL: sme_cntsb_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #24
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 3
+ %res = mul nuw nsw i64 %shl, 96
+ ret i64 %res
+}
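+
+; Worked arithmetic for the fold above (my reading of the CHECK lines, not
+; taken from the commit): cntsd returns the streaming vector length in
+; doublewords, i.e. SVL_bytes / 8, so (cntsd << 3) * 96 = 96 * SVL_bytes =
+; (24 * SVL_bytes) << 2, which matches rdsvl x8, #24 followed by
+; lsl x0, x8, #2.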
+
+define i64 @sme_cntsh_mul_pos() {
+; CHECK-LABEL: sme_cntsh_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #3
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 2
+ %res = mul nuw nsw i64 %shl, 3
+ ret i64 %res
+}
+
+define i64 @sme_cntsw_mul_pos() {
+; CHECK-LABEL: sme_cntsw_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #31
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 1
+ %res = mul nuw nsw i64 %shl, 62
+ ret i64 %res
+}
+
+define i64 @sme_cntsd_mul_pos() {
+; CHECK-LABEL: sme_cntsd_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #31
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, 992
+ ret i64 %res
+}
+
+define i64 @sme_cntsb_mul_neg() {
+; CHECK-LABEL: sme_cntsb_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-24
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 3
+ %res = mul nuw nsw i64 %shl, -96
+ ret i64 %res
+}
+
+define i64 @sme_cntsh_mul_neg() {
+; CHECK-LABEL: sme_cntsh_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-3
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 2
+ %res = mul nuw nsw i64 %shl, -3
+ ret i64 %res
+}
+
+define i64 @sme_cntsw_mul_neg() {
+; CHECK-LABEL: sme_cntsw_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-31
+; CHECK-NEXT: lsl x0, x8, #3
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 1
+ %res = mul nuw nsw i64 %shl, -992
+ ret i64 %res
+}
+
+define i64 @sme_cntsd_mul_neg() {
+; CHECK-LABEL: sme_cntsd_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-3
+; CHECK-NEXT: lsr x0, x8, #3
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, -3
+ ret i64 %res
+}
+
+; Negative test: 993 cannot be decomposed into an in-range rdsvl immediate
+; and a power-of-two shift, so the multiply is not folded.
+define i64 @sme_cntsd_mul_fail() {
+; CHECK-LABEL: sme_cntsd_mul_fail:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov w9, #993 // =0x3e1
+; CHECK-NEXT: lsr x8, x8, #3
+; CHECK-NEXT: mul x0, x8, x9
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, 993
+ ret i64 %res
+}
+
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index b8d6c88..3f35cb5 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -829,7 +829,7 @@ define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal
; CHECK-SDAG-NEXT: bl __arm_sme_restore
; CHECK-SDAG-NEXT: b .LBB5_1
entry:
- invoke void @agnostic_za_call()
+ invoke void @agnostic_za_call() "aarch64_za_state_agnostic"
to label %exit unwind label %catch
catch:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 353c09b..ecd7cc2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1778,7 +1778,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1790,7 +1790,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1802,7 +1802,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1815,7 +1815,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = ashr i65 %value, 33
ret i65 %result
@@ -1875,21 +1875,19 @@ define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT: s_lshr_b32 s4, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: s_ashr_i32 s2, s3, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
+; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir
new file mode 100644
index 0000000..48e9818
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir
@@ -0,0 +1,97 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck %s
+
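+# The tests below exercise narrowing of a wide G_OR whose RHS is a zero
+# extension: (G_OR x:s64, (G_ZEXT y:s32)) can only affect the low 32 bits,
+# so the combiner is expected to unmerge x, OR the low half with y, and
+# re-merge with the untouched high half (as the CHECK lines show).
+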
+---
+name: test_combine_or_s64_s32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s32) = COPY $sgpr2
+ %2:_(s64) = G_ZEXT %1(s32)
+ %3:_(s64) = G_OR %0, %2
+ $sgpr0_sgpr1 = COPY %3(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_combine_or_s64_s32_rhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32_rhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s32) = COPY $sgpr2
+ %2:_(s64) = G_ZEXT %1(s32)
+ %3:_(s64) = G_OR %2, %0
+ $sgpr0_sgpr1 = COPY %3(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_combine_or_s64_s32_merge_unmerge
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32_merge_unmerge
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: $sgpr0 = COPY [[OR]](s32)
+ ; CHECK-NEXT: $sgpr1 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = COPY $sgpr1
+ %2:_(s32) = COPY $sgpr2
+ %3:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
+ %4:_(s64) = G_ZEXT %2(s32)
+ %5:_(s64) = G_OR %3, %4
+ %6:_(s32), %7:_(s32) = G_UNMERGE_VALUES %5(s64)
+ $sgpr0 = COPY %6(s32)
+ $sgpr1 = COPY %7(s32)
+ SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+...
+---
+name: negative_test_incorrect_types
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-LABEL: name: negative_test_incorrect_types
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s128) = G_ZEXT [[COPY1]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s128) = G_OR [[COPY]], [[ZEXT]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[OR]](s128)
+ %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %1:_(s64) = COPY $vgpr4_vgpr5
+ %2:_(s128) = G_ZEXT %1
+ %3:_(s128) = G_OR %0, %2
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 5dff8c1..667fa98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -227,39 +227,38 @@ exit:
define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
; GFX10-LABEL: single_lane_execution_attribute:
; GFX10: ; %bb.0: ; %.entry
-; GFX10-NEXT: s_getpc_b64 s[12:13]
-; GFX10-NEXT: s_mov_b32 s12, 0
+; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_mov_b32 s2, s0
-; GFX10-NEXT: s_mov_b32 s3, s12
+; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
-; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: s_xor_b32 s2, vcc_lo, exec_lo
-; GFX10-NEXT: s_and_b32 vcc_lo, s2, exec_lo
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v3, s12
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
-; GFX10-NEXT: s_add_i32 s12, s12, 4
+; GFX10-NEXT: s_add_i32 s2, s2, 4
; GFX10-NEXT: buffer_load_dword v3, v3, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: s_add_i32 s2, s3, s2
+; GFX10-NEXT: v_readfirstlane_b32 s12, v3
+; GFX10-NEXT: s_add_i32 s3, s12, s3
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX10-NEXT: s_branch .LBB4_6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index bd53032..715a777 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -4934,17 +4934,15 @@ define amdgpu_ps i64 @s_fshl_i64_5(i64 inreg %lhs, i64 inreg %rhs) {
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
; GCN-NEXT: s_lshr_b32 s2, s3, 27
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_or_b32 s0, s0, s2
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i64_5:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
; GFX11-NEXT: s_lshr_b32 s2, s3, 27
-; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5)
ret i64 %result
@@ -4954,20 +4952,13 @@ define amdgpu_ps i64 @s_fshl_i64_32(i64 inreg %lhs, i64 inreg %rhs) {
; GCN-LABEL: s_fshl_i64_32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_mov_b32 s2, s3
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_mov_b32 s0, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i64_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_mov_b32 s2, s3
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_mov_b32 s0, s3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32)
ret i64 %result
@@ -6823,56 +6814,50 @@ define amdgpu_ps i128 @s_fshl_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX6-NEXT: s_lshr_b32 s4, s5, 31
-; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX6-NEXT: s_or_b32 s0, s0, s4
; GFX6-NEXT: s_lshr_b32 s4, s7, 31
-; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshl_i128_65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX8-NEXT: s_lshr_b32 s4, s5, 31
-; GFX8-NEXT: s_mov_b32 s5, 0
; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT: s_or_b32 s0, s0, s4
; GFX8-NEXT: s_lshr_b32 s4, s7, 31
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshl_i128_65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX9-NEXT: s_lshr_b32 s4, s5, 31
-; GFX9-NEXT: s_mov_b32 s5, 0
; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT: s_or_b32 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s7, 31
-; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT: s_or_b32 s2, s2, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshl_i128_65:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s2, s5, 31
-; GFX10-NEXT: s_mov_b32 s3, 0
-; GFX10-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX10-NEXT: s_lshr_b32 s2, s7, 31
-; GFX10-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
+; GFX10-NEXT: s_lshr_b32 s4, s5, 31
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
+; GFX10-NEXT: s_lshr_b32 s5, s7, 31
+; GFX10-NEXT: s_or_b32 s0, s0, s4
+; GFX10-NEXT: s_or_b32 s2, s2, s5
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i128_65:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s5, 31
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX11-NEXT: s_lshr_b32 s2, s7, 31
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
+; GFX11-NEXT: s_lshr_b32 s4, s5, 31
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
+; GFX11-NEXT: s_lshr_b32 s5, s7, 31
+; GFX11-NEXT: s_or_b32 s0, s0, s4
+; GFX11-NEXT: s_or_b32 s2, s2, s5
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
ret i128 %result
@@ -6885,7 +6870,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX6-NEXT: v_lshl_b64 v[2:3], v[0:1], 1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[6:7], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -6896,7 +6881,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -6907,7 +6892,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX9-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX9-NEXT: v_or_b32_e32 v2, v2, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6919,7 +6904,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 31, v7
-; GFX10-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v2, v2, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -6931,7 +6916,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 31, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v4
; GFX11-NEXT: v_or_b32_e32 v2, v2, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index ea6b3a3..5aa5a671 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -4715,20 +4715,13 @@ define amdgpu_ps i64 @s_fshr_i64_32(i64 inreg %lhs, i64 inreg %rhs) {
; GCN-LABEL: s_fshr_i64_32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_mov_b32 s2, s3
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_mov_b32 s0, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i64_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_mov_b32 s2, s3
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_mov_b32 s0, s3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32)
ret i64 %result
@@ -4739,17 +4732,15 @@ define amdgpu_ps i64 @s_fshr_i64_48(i64 inreg %lhs, i64 inreg %rhs) {
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GCN-NEXT: s_lshr_b32 s2, s3, 16
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_or_b32 s0, s0, s2
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i64_48:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX11-NEXT: s_lshr_b32 s2, s3, 16
-; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48)
ret i64 %result
@@ -5293,34 +5284,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX6-NEXT: s_not_b32 s9, s8
-; GFX6-NEXT: s_sub_i32 s16, s2, 64
-; GFX6-NEXT: s_sub_i32 s12, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
+; GFX6-NEXT: s_sub_i32 s16, s0, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s17, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s18, 1, 0
; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX6-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX6-NEXT: s_sub_i32 s14, s0, 64
-; GFX6-NEXT: s_sub_i32 s12, 64, s0
-; GFX6-NEXT: s_cmp_lt_u32 s0, 64
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX6-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX6-NEXT: s_sub_i32 s14, s9, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s9
+; GFX6-NEXT: s_cmp_lt_u32 s9, 64
; GFX6-NEXT: s_cselect_b32 s15, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s0, 0
+; GFX6-NEXT: s_cmp_eq_u32 s9, 0
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5330,9 +5320,9 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s15, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX6-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_i128:
@@ -5340,34 +5330,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX8-NEXT: s_not_b32 s9, s8
-; GFX8-NEXT: s_sub_i32 s16, s2, 64
-; GFX8-NEXT: s_sub_i32 s12, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
+; GFX8-NEXT: s_sub_i32 s16, s0, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s17, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX8-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX8-NEXT: s_cmp_lg_u32 s17, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s18, 0
-; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX8-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX8-NEXT: s_sub_i32 s14, s0, 64
-; GFX8-NEXT: s_sub_i32 s12, 64, s0
-; GFX8-NEXT: s_cmp_lt_u32 s0, 64
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX8-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX8-NEXT: s_sub_i32 s14, s9, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s9
+; GFX8-NEXT: s_cmp_lt_u32 s9, 64
; GFX8-NEXT: s_cselect_b32 s15, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s0, 0
+; GFX8-NEXT: s_cmp_eq_u32 s9, 0
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5377,9 +5366,9 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s15, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX8-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_i128:
@@ -5387,34 +5376,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX9-NEXT: s_not_b32 s9, s8
-; GFX9-NEXT: s_sub_i32 s16, s2, 64
-; GFX9-NEXT: s_sub_i32 s12, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
+; GFX9-NEXT: s_sub_i32 s16, s0, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s17, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX9-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX9-NEXT: s_cmp_lg_u32 s17, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX9-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX9-NEXT: s_sub_i32 s14, s0, 64
-; GFX9-NEXT: s_sub_i32 s12, 64, s0
-; GFX9-NEXT: s_cmp_lt_u32 s0, 64
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX9-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX9-NEXT: s_sub_i32 s14, s9, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s9
+; GFX9-NEXT: s_cmp_lt_u32 s9, 64
; GFX9-NEXT: s_cselect_b32 s15, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s0, 0
+; GFX9-NEXT: s_cmp_eq_u32 s9, 0
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5424,19 +5412,18 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s15, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s10, s1, 31
-; GFX10-NEXT: s_mov_b32 s11, 0
-; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8
+; GFX10-NEXT: s_lshr_b32 s9, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX10-NEXT: s_or_b32 s2, s2, s9
+; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8
; GFX10-NEXT: s_not_b32 s14, s8
; GFX10-NEXT: s_sub_i32 s16, s9, 64
; GFX10-NEXT: s_sub_i32 s10, 64, s9
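
The recurring change across these hunks: the old lowering zero-extended the single bit carried out of the low half (`s_lshr_b32 ..., 31`) into a 64-bit register pair with `s_mov_b32 ..., 0` so it could combine it via `s_or_b64`; the new lowering ORs that bit directly into the low 32-bit register with `s_or_b32`, dropping one instruction and freeing an SGPR per shift. For reference, a minimal IR sketch of what the `s_fshr_i128` checks cover (the intrinsic is real; the wrapper name is illustrative):

    declare i128 @llvm.fshr.i128(i128, i128, i128)

    define i128 @fshr_i128_sketch(i128 %lhs, i128 %rhs, i128 %amt) {
      ; Conceptually: shift the 256-bit concatenation lhs:rhs right by
      ; amt % 128 and return the low 128 bits.
      %r = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
      ret i128 %r
    }
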
@@ -5479,11 +5466,10 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX11-LABEL: s_fshr_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s10, s1, 31
-; GFX11-NEXT: s_mov_b32 s11, 0
-; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8
+; GFX11-NEXT: s_lshr_b32 s9, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX11-NEXT: s_or_b32 s2, s2, s9
+; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8
; GFX11-NEXT: s_not_b32 s14, s8
; GFX11-NEXT: s_sub_i32 s16, s9, 64
; GFX11-NEXT: s_sub_i32 s10, 64, s9
@@ -5786,13 +5772,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT: s_or_b32 s2, s2, s0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7
; GFX6-NEXT: v_not_b32_e32 v8, 63
; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1
-; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7
+; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7
; GFX6-NEXT: v_add_i32_e32 v9, vcc, v7, v8
; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5803,8 +5788,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT: v_mov_b32_e32 v3, s0
-; GFX6-NEXT: v_mov_b32_e32 v4, s1
+; GFX6-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NEXT: v_mov_b32_e32 v4, s3
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0
; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5839,13 +5824,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_or_b32 s2, s2, s0
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7
; GFX8-NEXT: v_not_b32_e32 v8, 63
; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v7, v8
; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5856,8 +5840,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0
; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5892,12 +5876,11 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT: s_or_b32 s2, s2, s0
; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7
; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
-; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7
; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5908,10 +5891,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5941,34 +5924,33 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX10-LABEL: v_fshr_i128_ssv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
-; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s8, s1, 31
-; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 1
+; GFX10-NEXT: s_lshr_b32 s2, s1, 31
; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11
+; GFX10-NEXT: s_or_b32 s8, s8, s2
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v11
; GFX10-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
-; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12
+; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12
; GFX10-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
; GFX10-NEXT: v_or_b32_e32 v3, v1, v3
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
; GFX10-NEXT: v_or_b32_e32 v4, v2, v4
; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v7, v9
; GFX10-NEXT: v_or_b32_e32 v7, v8, v10
; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
@@ -5988,18 +5970,18 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX11-LABEL: v_fshr_i128_ssv:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
-; GFX11-NEXT: s_lshr_b32 s8, s1, 31
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 1
+; GFX11-NEXT: s_lshr_b32 s2, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_mov_b32 s9, 0
+; GFX11-NEXT: s_or_b32 s8, s8, s2
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v11
; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX11-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
+; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX11-NEXT: v_dual_cndmask_b32 v5, 0, v5 :: v_dual_add_nc_u32 v0, 0xffffffc0, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12
; GFX11-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
@@ -6045,26 +6027,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX6-NEXT: s_not_b32 s5, s4
-; GFX6-NEXT: s_sub_i32 s12, s2, 64
-; GFX6-NEXT: s_sub_i32 s8, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
+; GFX6-NEXT: s_sub_i32 s12, s0, 64
+; GFX6-NEXT: s_sub_i32 s8, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s13, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s14, 1, 0
; GFX6-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX6-NEXT: s_and_b32 s0, s4, 0x7f
; GFX6-NEXT: s_sub_i32 s1, s0, 64
; GFX6-NEXT: s_sub_i32 s4, 64, s0
@@ -6073,14 +6054,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0
; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4
-; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: s_cselect_b32 s6, 1, 0
; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1
; GFX6-NEXT: s_and_b32 s0, 1, s5
; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX6-NEXT: s_and_b32 s0, 1, s8
+; GFX6-NEXT: s_and_b32 s0, 1, s6
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6088,10 +6069,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX6-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX6-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX6-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX6-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX6-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX6-NEXT: v_or_b32_e32 v3, s3, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i128_svs:
@@ -6099,26 +6080,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX8-NEXT: s_not_b32 s5, s4
-; GFX8-NEXT: s_sub_i32 s12, s2, 64
-; GFX8-NEXT: s_sub_i32 s8, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
+; GFX8-NEXT: s_sub_i32 s12, s0, 64
+; GFX8-NEXT: s_sub_i32 s8, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s13, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s14, 1, 0
; GFX8-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s14, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX8-NEXT: s_and_b32 s0, s4, 0x7f
; GFX8-NEXT: s_sub_i32 s1, s0, 64
; GFX8-NEXT: s_sub_i32 s4, 64, s0
@@ -6127,14 +6107,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX8-NEXT: s_cselect_b32 s8, 1, 0
+; GFX8-NEXT: s_cselect_b32 s6, 1, 0
; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
; GFX8-NEXT: s_and_b32 s0, 1, s5
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: s_and_b32 s0, 1, s8
+; GFX8-NEXT: s_and_b32 s0, 1, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6142,10 +6122,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX8-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX8-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX8-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX8-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX8-NEXT: v_or_b32_e32 v3, s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i128_svs:
@@ -6153,26 +6133,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX9-NEXT: s_not_b32 s5, s4
-; GFX9-NEXT: s_sub_i32 s12, s2, 64
-; GFX9-NEXT: s_sub_i32 s8, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
+; GFX9-NEXT: s_sub_i32 s12, s0, 64
+; GFX9-NEXT: s_sub_i32 s8, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s14, 1, 0
; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s14, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: s_and_b32 s0, s4, 0x7f
; GFX9-NEXT: s_sub_i32 s1, s0, 64
; GFX9-NEXT: s_sub_i32 s4, 64, s0
@@ -6181,14 +6160,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX9-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-NEXT: s_cselect_b32 s6, 1, 0
; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
; GFX9-NEXT: s_and_b32 s0, 1, s5
; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: s_and_b32 s0, 1, s8
+; GFX9-NEXT: s_and_b32 s0, 1, s6
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6196,20 +6175,19 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX9-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX9-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX9-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX9-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX9-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX9-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX9-NEXT: v_or_b32_e32 v3, s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i128_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s6, s1, 31
-; GFX10-NEXT: s_mov_b32 s7, 0
-; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
+; GFX10-NEXT: s_lshr_b32 s5, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT: s_or_b32 s2, s2, s5
+; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX10-NEXT: s_not_b32 s10, s4
; GFX10-NEXT: s_sub_i32 s12, s5, 64
; GFX10-NEXT: s_sub_i32 s6, 64, s5
@@ -6259,11 +6237,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX11-LABEL: v_fshr_i128_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s6, s1, 31
-; GFX11-NEXT: s_mov_b32 s7, 0
-; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4
+; GFX11-NEXT: s_lshr_b32 s5, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX11-NEXT: s_or_b32 s2, s2, s5
+; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4
; GFX11-NEXT: s_not_b32 s10, s4
; GFX11-NEXT: s_sub_i32 s12, s5, 64
; GFX11-NEXT: s_sub_i32 s6, 64, s5
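
The hunks that follow cover `s_fshr_v2i128`, the two-element vector form, which lowers each i128 lane with the same sequence shown above. A sketch of what its source likely looks like (the vector intrinsic is real; the body is a plausible reconstruction of the autogenerated test, not a quote from it):

    declare <2 x i128> @llvm.fshr.v2i128(<2 x i128>, <2 x i128>, <2 x i128>)

    define amdgpu_ps <2 x i128> @fshr_v2i128_sketch(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
      %r = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
      ret <2 x i128> %r
    }
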
@@ -6714,81 +6691,80 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
; GFX6-LABEL: s_fshr_v2i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT: s_lshr_b32 s22, s1, 31
-; GFX6-NEXT: s_mov_b32 s23, 0
; GFX6-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX6-NEXT: s_lshr_b32 s0, s1, 31
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX6-NEXT: s_not_b32 s17, s16
-; GFX6-NEXT: s_sub_i32 s21, s2, 64
-; GFX6-NEXT: s_sub_i32 s22, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
-; GFX6-NEXT: s_cselect_b32 s28, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
-; GFX6-NEXT: s_cselect_b32 s29, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX6-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX6-NEXT: s_cmp_lg_u32 s28, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX6-NEXT: s_cmp_lg_u32 s29, 0
-; GFX6-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX6-NEXT: s_and_b32 s0, s16, 0x7f
; GFX6-NEXT: s_sub_i32 s21, s0, 64
; GFX6-NEXT: s_sub_i32 s22, 64, s0
; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s26, 1, 0
; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s27, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX6-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX6-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX6-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX6-NEXT: s_cmp_lg_u32 s26, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX6-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX6-NEXT: s_cmp_lg_u32 s27, 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX6-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX6-NEXT: s_sub_i32 s21, s17, 64
+; GFX6-NEXT: s_sub_i32 s22, 64, s17
+; GFX6-NEXT: s_cmp_lt_u32 s17, 64
+; GFX6-NEXT: s_cselect_b32 s24, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, 0
+; GFX6-NEXT: s_cselect_b32 s25, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX6-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX6-NEXT: s_cmp_lg_u32 s26, 0
+; GFX6-NEXT: s_cmp_lg_u32 s24, 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX6-NEXT: s_cmp_lg_u32 s27, 0
+; GFX6-NEXT: s_cmp_lg_u32 s25, 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT: s_cmp_lg_u32 s26, 0
-; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX6-NEXT: s_lshr_b32 s22, s5, 31
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX6-NEXT: s_cmp_lg_u32 s24, 0
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX6-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX6-NEXT: s_lshr_b32 s4, s5, 31
+; GFX6-NEXT: s_or_b32 s6, s6, s4
+; GFX6-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX6-NEXT: s_not_b32 s16, s20
-; GFX6-NEXT: s_sub_i32 s18, s6, 64
-; GFX6-NEXT: s_sub_i32 s10, 64, s6
-; GFX6-NEXT: s_cmp_lt_u32 s6, 64
+; GFX6-NEXT: s_sub_i32 s18, s4, 64
+; GFX6-NEXT: s_sub_i32 s10, 64, s4
+; GFX6-NEXT: s_cmp_lt_u32 s4, 64
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s6, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, 0
; GFX6-NEXT: s_cselect_b32 s21, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX6-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX6-NEXT: s_cmp_lg_u32 s21, 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX6-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX6-NEXT: s_sub_i32 s18, s4, 64
-; GFX6-NEXT: s_sub_i32 s16, 64, s4
-; GFX6-NEXT: s_cmp_lt_u32 s4, 64
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX6-NEXT: s_sub_i32 s18, s8, 64
+; GFX6-NEXT: s_sub_i32 s16, 64, s8
+; GFX6-NEXT: s_cmp_lt_u32 s8, 64
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_cmp_eq_u32 s8, 0
; GFX6-NEXT: s_cselect_b32 s21, 1, 0
; GFX6-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
@@ -6796,88 +6772,87 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX6-NEXT: s_cmp_lg_u32 s21, 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX6-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_v2i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT: s_lshr_b32 s22, s1, 31
-; GFX8-NEXT: s_mov_b32 s23, 0
; GFX8-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX8-NEXT: s_lshr_b32 s0, s1, 31
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX8-NEXT: s_not_b32 s17, s16
-; GFX8-NEXT: s_sub_i32 s21, s2, 64
-; GFX8-NEXT: s_sub_i32 s22, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
-; GFX8-NEXT: s_cselect_b32 s28, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
-; GFX8-NEXT: s_cselect_b32 s29, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX8-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX8-NEXT: s_cmp_lg_u32 s28, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX8-NEXT: s_cmp_lg_u32 s29, 0
-; GFX8-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX8-NEXT: s_and_b32 s0, s16, 0x7f
; GFX8-NEXT: s_sub_i32 s21, s0, 64
; GFX8-NEXT: s_sub_i32 s22, 64, s0
; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s26, 1, 0
; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s27, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX8-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX8-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX8-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX8-NEXT: s_cmp_lg_u32 s26, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX8-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX8-NEXT: s_cmp_lg_u32 s27, 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX8-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX8-NEXT: s_sub_i32 s21, s17, 64
+; GFX8-NEXT: s_sub_i32 s22, 64, s17
+; GFX8-NEXT: s_cmp_lt_u32 s17, 64
+; GFX8-NEXT: s_cselect_b32 s24, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s17, 0
+; GFX8-NEXT: s_cselect_b32 s25, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX8-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
+; GFX8-NEXT: s_cmp_lg_u32 s24, 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX8-NEXT: s_cmp_lg_u32 s27, 0
+; GFX8-NEXT: s_cmp_lg_u32 s25, 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
-; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX8-NEXT: s_lshr_b32 s22, s5, 31
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX8-NEXT: s_cmp_lg_u32 s24, 0
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX8-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX8-NEXT: s_lshr_b32 s4, s5, 31
+; GFX8-NEXT: s_or_b32 s6, s6, s4
+; GFX8-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX8-NEXT: s_not_b32 s16, s20
-; GFX8-NEXT: s_sub_i32 s18, s6, 64
-; GFX8-NEXT: s_sub_i32 s10, 64, s6
-; GFX8-NEXT: s_cmp_lt_u32 s6, 64
+; GFX8-NEXT: s_sub_i32 s18, s4, 64
+; GFX8-NEXT: s_sub_i32 s10, 64, s4
+; GFX8-NEXT: s_cmp_lt_u32 s4, 64
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s6, 0
+; GFX8-NEXT: s_cmp_eq_u32 s4, 0
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX8-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX8-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX8-NEXT: s_sub_i32 s18, s4, 64
-; GFX8-NEXT: s_sub_i32 s16, 64, s4
-; GFX8-NEXT: s_cmp_lt_u32 s4, 64
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX8-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX8-NEXT: s_sub_i32 s18, s8, 64
+; GFX8-NEXT: s_sub_i32 s16, 64, s8
+; GFX8-NEXT: s_cmp_lt_u32 s8, 64
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_cmp_eq_u32 s8, 0
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
; GFX8-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
@@ -6885,88 +6860,87 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX8-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_v2i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT: s_lshr_b32 s22, s1, 31
-; GFX9-NEXT: s_mov_b32 s23, 0
; GFX9-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX9-NEXT: s_lshr_b32 s0, s1, 31
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX9-NEXT: s_not_b32 s17, s16
-; GFX9-NEXT: s_sub_i32 s21, s2, 64
-; GFX9-NEXT: s_sub_i32 s22, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
-; GFX9-NEXT: s_cselect_b32 s28, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s29, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX9-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX9-NEXT: s_cmp_lg_u32 s28, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX9-NEXT: s_cmp_lg_u32 s29, 0
-; GFX9-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX9-NEXT: s_and_b32 s0, s16, 0x7f
; GFX9-NEXT: s_sub_i32 s21, s0, 64
; GFX9-NEXT: s_sub_i32 s22, 64, s0
; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s26, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s27, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX9-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX9-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX9-NEXT: s_cmp_lg_u32 s26, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX9-NEXT: s_cmp_lg_u32 s27, 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX9-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX9-NEXT: s_sub_i32 s21, s17, 64
+; GFX9-NEXT: s_sub_i32 s22, 64, s17
+; GFX9-NEXT: s_cmp_lt_u32 s17, 64
+; GFX9-NEXT: s_cselect_b32 s24, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s17, 0
+; GFX9-NEXT: s_cselect_b32 s25, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX9-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX9-NEXT: s_cmp_lg_u32 s26, 0
+; GFX9-NEXT: s_cmp_lg_u32 s24, 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX9-NEXT: s_cmp_lg_u32 s27, 0
+; GFX9-NEXT: s_cmp_lg_u32 s25, 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT: s_cmp_lg_u32 s26, 0
-; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX9-NEXT: s_lshr_b32 s22, s5, 31
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX9-NEXT: s_cmp_lg_u32 s24, 0
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX9-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX9-NEXT: s_lshr_b32 s4, s5, 31
+; GFX9-NEXT: s_or_b32 s6, s6, s4
+; GFX9-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX9-NEXT: s_not_b32 s16, s20
-; GFX9-NEXT: s_sub_i32 s18, s6, 64
-; GFX9-NEXT: s_sub_i32 s10, 64, s6
-; GFX9-NEXT: s_cmp_lt_u32 s6, 64
+; GFX9-NEXT: s_sub_i32 s18, s4, 64
+; GFX9-NEXT: s_sub_i32 s10, 64, s4
+; GFX9-NEXT: s_cmp_lt_u32 s4, 64
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s6, 0
+; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX9-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX9-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX9-NEXT: s_sub_i32 s18, s4, 64
-; GFX9-NEXT: s_sub_i32 s16, 64, s4
-; GFX9-NEXT: s_cmp_lt_u32 s4, 64
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX9-NEXT: s_sub_i32 s18, s8, 64
+; GFX9-NEXT: s_sub_i32 s16, 64, s8
+; GFX9-NEXT: s_cmp_lt_u32 s8, 64
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_cmp_eq_u32 s8, 0
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
@@ -6974,61 +6948,60 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s18, s1, 31
-; GFX10-NEXT: s_mov_b32 s19, 0
-; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16
+; GFX10-NEXT: s_lshr_b32 s17, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19]
-; GFX10-NEXT: s_not_b32 s18, s16
-; GFX10-NEXT: s_sub_i32 s21, s17, 64
-; GFX10-NEXT: s_sub_i32 s22, 64, s17
+; GFX10-NEXT: s_or_b32 s2, s2, s17
+; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16
+; GFX10-NEXT: s_not_b32 s21, s16
+; GFX10-NEXT: s_sub_i32 s26, s17, 64
+; GFX10-NEXT: s_sub_i32 s18, 64, s17
; GFX10-NEXT: s_cmp_lt_u32 s17, 64
-; GFX10-NEXT: s_cselect_b32 s28, 1, 0
+; GFX10-NEXT: s_cselect_b32 s27, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s17, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
-; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
-; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
-; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
-; GFX10-NEXT: s_cmp_lg_u32 s28, 0
-; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT: s_lshr_b64 s[18:19], s[0:1], s18
+; GFX10-NEXT: s_lshl_b64 s[22:23], s[2:3], s21
+; GFX10-NEXT: s_lshl_b64 s[24:25], s[0:1], s21
+; GFX10-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s26
+; GFX10-NEXT: s_cmp_lg_u32 s27, 0
+; GFX10-NEXT: s_cselect_b64 s[22:23], s[24:25], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX10-NEXT: s_and_b32 s0, s16, 0x7f
-; GFX10-NEXT: s_sub_i32 s18, s0, 64
+; GFX10-NEXT: s_sub_i32 s21, s0, 64
; GFX10-NEXT: s_sub_i32 s17, 64, s0
; GFX10-NEXT: s_cmp_lt_u32 s0, 64
-; GFX10-NEXT: s_cselect_b32 s21, 1, 0
+; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
-; GFX10-NEXT: s_cselect_b32 s26, 1, 0
+; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
-; GFX10-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX10-NEXT: s_lshl_b64 s[18:19], s[10:11], s17
; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
-; GFX10-NEXT: s_cmp_lg_u32 s21, 0
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
+; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX10-NEXT: s_cmp_lg_u32 s26, 0
+; GFX10-NEXT: s_cmp_lg_u32 s25, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT: s_cmp_lg_u32 s21, 0
+; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT: s_lshr_b32 s18, s5, 31
+; GFX10-NEXT: s_lshr_b32 s8, s5, 31
+; GFX10-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT: s_or_b32 s6, s6, s8
; GFX10-NEXT: s_andn2_b32 s8, 0x7f, s20
-; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19]
; GFX10-NEXT: s_not_b32 s16, s20
; GFX10-NEXT: s_sub_i32 s18, s8, 64
; GFX10-NEXT: s_sub_i32 s9, 64, s8
@@ -7071,54 +7044,53 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX11-LABEL: s_fshr_v2i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s18, s1, 31
-; GFX11-NEXT: s_mov_b32 s19, 0
-; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16
+; GFX11-NEXT: s_lshr_b32 s17, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19]
-; GFX11-NEXT: s_not_b32 s18, s16
-; GFX11-NEXT: s_sub_i32 s21, s17, 64
-; GFX11-NEXT: s_sub_i32 s22, 64, s17
+; GFX11-NEXT: s_or_b32 s2, s2, s17
+; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16
+; GFX11-NEXT: s_not_b32 s21, s16
+; GFX11-NEXT: s_sub_i32 s26, s17, 64
+; GFX11-NEXT: s_sub_i32 s18, 64, s17
; GFX11-NEXT: s_cmp_lt_u32 s17, 64
-; GFX11-NEXT: s_cselect_b32 s28, 1, 0
+; GFX11-NEXT: s_cselect_b32 s27, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s17, 0
; GFX11-NEXT: s_cselect_b32 s17, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
-; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
-; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
-; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT: s_lshr_b64 s[18:19], s[0:1], s18
+; GFX11-NEXT: s_lshl_b64 s[22:23], s[2:3], s21
+; GFX11-NEXT: s_lshl_b64 s[24:25], s[0:1], s21
+; GFX11-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s26
+; GFX11-NEXT: s_cmp_lg_u32 s27, 0
+; GFX11-NEXT: s_cselect_b64 s[22:23], s[24:25], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1]
; GFX11-NEXT: s_cmp_lg_u32 s17, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX11-NEXT: s_and_b32 s0, s16, 0x7f
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s18, s0, 64
+; GFX11-NEXT: s_sub_i32 s21, s0, 64
; GFX11-NEXT: s_sub_i32 s17, 64, s0
; GFX11-NEXT: s_cmp_lt_u32 s0, 64
-; GFX11-NEXT: s_cselect_b32 s21, 1, 0
+; GFX11-NEXT: s_cselect_b32 s24, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s0, 0
-; GFX11-NEXT: s_cselect_b32 s26, 1, 0
+; GFX11-NEXT: s_cselect_b32 s25, 1, 0
; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
-; GFX11-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX11-NEXT: s_lshl_b64 s[18:19], s[10:11], s17
; GFX11-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
-; GFX11-NEXT: s_cmp_lg_u32 s21, 0
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
+; GFX11-NEXT: s_cmp_lg_u32 s24, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX11-NEXT: s_cmp_lg_u32 s26, 0
+; GFX11-NEXT: s_cmp_lg_u32 s25, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX11-NEXT: s_cmp_lg_u32 s21, 0
+; GFX11-NEXT: s_cmp_lg_u32 s24, 0
; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX11-NEXT: s_lshr_b32 s18, s5, 31
+; GFX11-NEXT: s_lshr_b32 s8, s5, 31
+; GFX11-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT: s_or_b32 s6, s6, s8
; GFX11-NEXT: s_and_not1_b32 s8, 0x7f, s20
-; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19]
; GFX11-NEXT: s_not_b32 s16, s20
; GFX11-NEXT: s_sub_i32 s18, s8, 64
; GFX11-NEXT: s_sub_i32 s9, 64, s8
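
The new MIR tests below check legalization of `G_FMAXIMUM` (IEEE-754-2019 maximum: NaN-propagating, with -0 ordered below +0). On gfx900 the operation is not legal, so the checks show it expanded into `G_FMAXNUM_IEEE` plus an ordered compare that selects a quiet NaN when either input is NaN; on gfx1200 it stays a single `G_FMAXIMUM`. Roughly the same expansion at the IR level, as a sketch under the assumption that the target's IEEE max instruction already orders -0 below +0 (the intrinsic is real; the function name is illustrative):

    declare float @llvm.maxnum.f32(float, float)

    define float @fmaximum_expansion_sketch(float %a, float %b) {
      %m   = call float @llvm.maxnum.f32(float %a, float %b)    ; IEEE maxNum
      %ord = fcmp ord float %a, %b                              ; false if either input is NaN
      %r   = select i1 %ord, float %m, float 0x7FF8000000000000 ; propagate a quiet NaN
      ret float %r
    }
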
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
new file mode 100644
index 0000000..4b214e6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
@@ -0,0 +1,275 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX12 %s
+
+---
+name: test_fmaximum_f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[SELECT]](s16)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY2]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[TRUNC]], [[TRUNC1]]
+ ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMAXIMUM]](s16)
+ ; GFX12-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s32) = COPY $vgpr1
+ %3:_(s16) = G_TRUNC %2(s32)
+ %4:_(s16) = G_FMAXIMUM %1, %3
+ %5:_(s32) = G_ANYEXT %4(s16)
+ $vgpr0 = COPY %5(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f64
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; GFX9-LABEL: name: test_fmaximum_f64
+ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s64) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s64), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SELECT]](s64)
+ ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f64
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FMAXIMUM]](s64)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_FMAXIMUM %0, %1
+ $vgpr0_vgpr1 = COPY %2(s64)
+ SI_RETURN implicit $vgpr0_vgpr1
+...
+---
+name: test_fmaximum_v2f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_v2f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC2]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC1]](s16), [[TRUNC3]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM_IEEE]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[TRUNC4]], [[C1]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[TRUNC5]], [[C1]]
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](<2 x s16>)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_v2f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s16>) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](<2 x s16>)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(<2 x s16>) = COPY $vgpr0
+ %1:_(<2 x s16>) = COPY $vgpr1
+ %2:_(<2 x s16>) = G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(<2 x s16>)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_v2f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_fmaximum_v2f32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY2]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY2]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY1]], [[COPY3]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY1]](s32), [[COPY3]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[FMAXNUM_IEEE1]], [[C]]
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_v2f32
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY2]]
+ ; GFX12-NEXT: [[FMAXIMUM1:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY1]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[FMAXIMUM1]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(<2 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32)
+ %6:_(<2 x s32>) = G_FMAXIMUM %2, %5
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+ $vgpr0 = COPY %7(s32)
+ $vgpr1 = COPY %8(s32)
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+---
+name: test_fmaximum_nsz_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_nsz_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_nsz_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = nsz G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nsz G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_nnan_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_nnan_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMAXNUM_IEEE]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_nnan_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = nnan G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nnan G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+
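
The companion file below is the mirrored minimum case: the same test structure, with `G_FMINNUM_IEEE` in the gfx900 expansion and `G_FMINIMUM` legal on gfx1200. The IR-level analogue, with the same caveats as the maximum sketch above:

    declare float @llvm.minnum.f32(float, float)

    define float @fminimum_expansion_sketch(float %a, float %b) {
      %m   = call float @llvm.minnum.f32(float %a, float %b)    ; IEEE minNum
      %ord = fcmp ord float %a, %b                              ; false if either input is NaN
      %r   = select i1 %ord, float %m, float 0x7FF8000000000000 ; propagate a quiet NaN
      ret float %r
    }
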
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir
new file mode 100644
index 0000000..8ba0794
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir
@@ -0,0 +1,275 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX12 %s
+
+---
+name: test_fminimum_f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s16) = G_FMINNUM_IEEE [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[SELECT]](s16)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY2]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s16) = G_FMINIMUM [[TRUNC]], [[TRUNC1]]
+ ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMINIMUM]](s16)
+ ; GFX12-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s32) = COPY $vgpr1
+ %3:_(s16) = G_TRUNC %2(s32)
+ %4:_(s16) = G_FMINIMUM %1, %3
+ %5:_(s32) = G_ANYEXT %4(s16)
+ $vgpr0 = COPY %5(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_f64
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; GFX9-LABEL: name: test_fminimum_f64
+ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s64), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SELECT]](s64)
+ ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f64
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s64) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FMINIMUM]](s64)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_FMINIMUM %0, %1
+ $vgpr0_vgpr1 = COPY %2(s64)
+ SI_RETURN implicit $vgpr0_vgpr1
+...
+---
+name: test_fminimum_v2f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_v2f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC2]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC1]](s16), [[TRUNC3]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FMINNUM_IEEE]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[TRUNC4]], [[C1]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[TRUNC5]], [[C1]]
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](<2 x s16>)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_v2f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(<2 x s16>) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](<2 x s16>)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(<2 x s16>) = COPY $vgpr0
+ %1:_(<2 x s16>) = COPY $vgpr1
+ %2:_(<2 x s16>) = G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(<2 x s16>)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_v2f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_fminimum_v2f32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY2]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY2]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY1]], [[COPY3]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY1]](s32), [[COPY3]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[FMINNUM_IEEE1]], [[C]]
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fminimum_v2f32
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY]], [[COPY2]]
+ ; GFX12-NEXT: [[FMINIMUM1:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY1]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[FMINIMUM1]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(<2 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32)
+ %6:_(<2 x s32>) = G_FMINIMUM %2, %5
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+ $vgpr0 = COPY %7(s32)
+ $vgpr1 = COPY %8(s32)
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+---
+name: test_fminimum_nsz_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_nsz_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_nsz_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = nsz G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nsz G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_nnan_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_nnan_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMINNUM_IEEE]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_nnan_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = nnan G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nnan G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+
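
The nnan variants show the payoff of the flag: once NaNs are excluded, the ordered-compare/select fixup is dead and only the G_FMINNUM_IEEE survives on GFX9. A hedged IR-level sketch (function name illustrative; llvm.minimum carries the same IEEE-754-2019 semantics as G_FMINIMUM):

define float @fminimum_nnan_sketch(float %a, float %b) {
  ; with nnan, the legalizer's NaN fixup folds away and a plain
  ; IEEE minimum is enough
  %r = call nnan float @llvm.minimum.f32(float %a, float %b)
  ret float %r
}

declare float @llvm.minimum.f32(float, float)
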
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
index c8bd8ab..423ce82 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
@@ -18,58 +18,12 @@ body: |
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
- %2:_(<2 x s32>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0, 1)
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0, %1
$vgpr0_vgpr1 = COPY %2
...
---
-name: shufflevector_scalar_src_dst
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1
-
- ; CHECK-LABEL: name: shufflevector_scalar_src_dst
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $vgpr1
- %2:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(1)
- $vgpr0 = COPY %2
-
-...
-
----
-name: shufflevector_scalar_dst
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-
- ; CHECK-LABEL: name: shufflevector_scalar_dst
- ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](s32)
- %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
- %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
- %2:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(2)
- $vgpr0 = COPY %2
-
-...
-
----
name: shufflevector_v2s32_0_1
tracksRegLiveness: true
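
The two deleted tests covered shuffles with scalar sources or a scalar result, presumably because those forms of G_SHUFFLE_VECTOR are no longer accepted; the surviving case is now spelled as the operation it always was, since a mask of (0, 1) over two scalars is plain vector construction. The IR equivalent of the new G_BUILD_VECTOR line (names illustrative):

define <2 x i32> @build_vector_sketch(i32 %a, i32 %b) {
  ; equivalent of G_BUILD_VECTOR %a, %b
  %t = insertelement <2 x i32> poison, i32 %a, i64 0
  %v = insertelement <2 x i32> %t, i32 %b, i64 1
  ret <2 x i32> %v
}
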
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 5171403..7714c03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
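
Both deletions in this file (and the matching wave64 file below) drop an s_cmp_lg_u32 that merely re-derived a fact already in SCC: s_and_b32 sets SCC to "result is non-zero", which is exactly the condition s_cbranch_scc0 consumes. A hedged sketch of the kind of IR these checks exercise (constants and block names illustrative, not from the patch):

define amdgpu_cs i32 @ballot_eq_zero_sketch(i32 %v) {
entry:
  %bit = trunc i32 %v to i1
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %bit)
  ; lowered to s_and_b32 + s_cbranch_scc0 on wave32
  %none = icmp eq i32 %ballot, 0
  br i1 %none, label %false, label %true
true:
  ret i32 42
false:
  ret i32 33
}

declare i32 @llvm.amdgcn.ballot.i32(i1)
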
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b01f13..7b81669 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 8533e34..518af70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1750,7 +1750,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1763,7 +1763,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1776,7 +1776,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1789,7 +1789,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX10-NEXT: v_and_b32_e32 v0, 1, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1800,7 +1800,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = lshr i65 %value, 33
@@ -1859,21 +1859,19 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_lshr_i65_33:
; GCN: ; %bb.0:
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_lshr_b32 s4, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GFX10PLUS-NEXT: s_mov_b32 s2, 0
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
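
The rewritten scalar sequences work because an i65 here lives in s[0:1] (low 64 bits) plus s2 (bit 64), and after a right shift by 33 at most 32 result bits are nonzero, so a single 32-bit OR replaces the old 64-bit OR with a zeroed high word. The low-word math as a standalone IR sketch (function and parameter names illustrative):

define i32 @lshr_i65_33_low_word(i32 %s1, i32 %s2) {
  ; source bits [63:33] (the top 31 bits of s1) land in result bits [30:0]
  %mid = lshr i32 %s1, 1
  ; source bit 64 (the low bit of s2) lands in result bit 31
  %top = and i32 %s2, 1
  %hi = shl i32 %top, 31
  %lo = or i32 %hi, %mid
  ret i32 %lo
}
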
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
index af377b1..e0581f01 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
@@ -597,13 +597,13 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 s5, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], 0x50
+; GFX7-NEXT: s_or_b32 s4, s3, 0x50
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -616,7 +616,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: s_mov_b32 s3, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX8-NEXT: s_or_b32 s2, s2, 0x50
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -630,7 +630,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX9-NEXT: s_or_b32 s2, s2, 0x50
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -644,7 +644,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX10-NEXT: s_or_b32 s2, s2, 0x50
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -658,7 +658,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX11-NEXT: s_or_b32 s2, s2, 0x50
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -671,7 +671,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX12-NEXT: s_or_b32 s2, s2, 0x50
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
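
Every hunk in this file is the same strength reduction: after a zext from i32, the high word of the 64-bit value is a known zero, so OR-ing in a constant that fits in 32 bits (0x50 here) only needs s_or_b32 on the low half. A minimal IR sketch of the pattern (function name illustrative):

define i64 @or_zext_sketch(i32 %x) {
  ; the high 32 bits of %ext are provably zero
  %ext = zext i32 %x to i64
  ; the constant fits in the low word, so a 32-bit OR suffices
  %or = or i64 %ext, 80
  ret i64 %or
}
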
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
index 6e4c6bc..31e3d97 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
@@ -147,15 +147,17 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
- ; CHECK-NEXT: G_STORE [[UV4]](s16), [[COPY1]](p3) :: (store (s16), addrspace 3)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD]](p3) :: (load (s16), addrspace 3)
+ ; CHECK-NEXT: G_STORE [[LOAD]](s16), [[COPY1]](p3) :: (store (s16), addrspace 3)
; CHECK-NEXT: SI_RETURN
%0:_(p3) = COPY $vgpr0
%1:_(p3) = COPY $vgpr1
%2:_(<8 x s16>) = G_IMPLICIT_DEF
%3:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
- %4:_(s16) = G_SHUFFLE_VECTOR %3(<8 x s16>), %2, shufflemask(4)
+ %idx:_(s32) = G_CONSTANT i32 4
+ %4:_(s16) = G_EXTRACT_VECTOR_ELT %3(<8 x s16>), %idx
G_STORE %4(s16), %1(p3) :: (store (s16), addrspace 3)
SI_RETURN
...
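
Rewriting the single-index shuffle as G_EXTRACT_VECTOR_ELT with a constant index is what lets the combiner kill the wide load: lane 4 of a <8 x s16> sits at byte offset 4 * 2 = 8, matching the G_PTR_ADD by 8 plus the narrow s16 G_LOAD in the new checks. The same fold at IR level (names illustrative):

define void @extract_lane_sketch(ptr addrspace(3) %src, ptr addrspace(3) %dst) {
  %vec = load <8 x i16>, ptr addrspace(3) %src, align 8
  ; only lane 4 is used, so the load can shrink to one i16 at %src + 8
  %elt = extractelement <8 x i16> %vec, i32 4
  store i16 %elt, ptr addrspace(3) %dst, align 2
  ret void
}
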
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index a9b3deb..cfe655f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -1381,7 +1381,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1393,7 +1393,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1405,7 +1405,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1418,7 +1418,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%shl = shl i65 %value, 33
%ashr = ashr i65 %value, 33
@@ -1429,29 +1429,27 @@ define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) {
; GCN-LABEL: s_sext_inreg_i65_18:
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GCN-NEXT: s_lshr_b32 s4, s1, 14
-; GCN-NEXT: s_mov_b32 s5, 0
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_lshr_b32 s3, s1, 14
+; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GCN-NEXT: s_lshl_b32 s7, s2, 14
-; GCN-NEXT: s_mov_b32 s6, s5
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GCN-NEXT: s_lshl_b32 s5, s2, 14
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_18:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14
-; GFX10PLUS-NEXT: s_mov_b32 s5, 0
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 14
; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
-; GFX10PLUS-NEXT: s_mov_b32 s6, s5
+; GFX10PLUS-NEXT: s_or_b32 s2, s2, s3
+; GFX10PLUS-NEXT: s_mov_b32 s4, 0
; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14
+; GFX10PLUS-NEXT: s_lshl_b32 s5, s2, 14
; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 18
%ashr = ashr i65 %shl, 18
@@ -1464,13 +1462,12 @@ define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) {
; GCN-NEXT: s_lshl_b32 s3, s2, 1
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], 31
-; GCN-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GCN-NEXT: s_bfe_u32 s0, s0, 0x1f0000
-; GCN-NEXT: s_mov_b32 s1, s2
-; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: s_ashr_i32 s2, s5, 1
+; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT: s_bfe_u32 s4, s0, 0x1f0000
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
+; GCN-NEXT: s_ashr_i32 s2, s3, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_33:
@@ -1478,13 +1475,12 @@ define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) {
; GFX10PLUS-NEXT: s_lshl_b32 s3, s2, 1
; GFX10PLUS-NEXT: s_mov_b32 s2, 0
; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], 31
-; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x1f0000
-; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
-; GFX10PLUS-NEXT: s_mov_b32 s1, s2
-; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[4:5], 31
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1
+; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX10PLUS-NEXT: s_bfe_u32 s4, s0, 0x1f0000
+; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 33
%ashr = ashr i65 %shl, 33
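
These tests lean on the classic identity that a left shift followed by an arithmetic right shift by the same amount sign-extends from a fixed bit; the updated GCN sequences then apply the same 32-bit OR narrowing seen in lshr.ll. The identity in plain 64-bit IR (a sketch; the tests above use i65 split across scalar registers):

define i64 @sext_inreg_sketch(i64 %v) {
  ; shl + ashr by 18 sign-extends from bit 63 - 18 = 45
  %shl = shl i64 %v, 18
  %r = ashr i64 %shl, 18
  ret i64 %r
}
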
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir
index ac903ad..e7bba9d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir
@@ -19,15 +19,15 @@ body: |
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(p0), [[UV1:%[0-9]+]]:_(p0) = G_UNMERGE_VALUES [[BITCAST]](<2 x p0>)
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY [[UV]](p0)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY [[COPY]](p0)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](p0)
+ ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](p0)
; CHECK-NEXT: $vgpr0 = COPY [[UV2]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV3]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
%0:_(p0) = G_CONSTANT i64 0
%1:_(<2 x p0>) = G_BUILD_VECTOR %0:_(p0), %0:_(p0)
%2:_(<2 x p0>) = G_LOAD %0:_(p0) :: (load (<2 x p0>))
- %3:_(p0) = G_SHUFFLE_VECTOR %2:_(<2 x p0>), %1:_, shufflemask(0)
+ %idx:_(s32) = G_CONSTANT i32 0
+ %3:_(p0) = G_EXTRACT_VECTOR_ELT %2:_(<2 x p0>), %idx
%4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3:_(p0)
$vgpr0 = COPY %4:_(s32)
$vgpr1 = COPY %5:_(s32)
diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll
index b3a7057..c551375 100644
--- a/llvm/test/CodeGen/AMDGPU/add-max.ll
+++ b/llvm/test/CodeGen/AMDGPU/add-max.ll
@@ -7,7 +7,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -18,39 +18,38 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
-; SDAG-LABEL: add_max_u32_ssv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_add_max_u32 v0, s0, s1, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_ssv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+; GCN-LABEL: add_max_u32_ssv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_max_u32 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
-; GCN-LABEL: add_max_u32_sss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_add_co_i32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_max_u32 s0, s0, s2
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+; SDAG-LABEL: add_max_u32_sss:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_max_u32_e32 v0, s2, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_u32_sss:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_max_u32 v0, s0, s1, v0
+; GISEL-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -61,7 +60,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, v0, s0, 4
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 4)
%ret = bitcast i32 %max to float
ret float %ret
@@ -72,26 +71,19 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, s0, v0, 0x64
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 100)
%ret = bitcast i32 %max to float
ret float %ret
}
-define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
-; SDAG-LABEL: add_max_u32_slv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_add_max_u32 v0, 0x64, s0, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_slv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_addk_co_i32 s0, 0x64
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add i32 %a, 100
- %max = call i32 @llvm.umax.i32(i32 %add, i32 %b)
+define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_u32_slv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
+ %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
@@ -101,7 +93,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -112,7 +104,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umin.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -123,7 +115,7 @@ define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.smin.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -134,7 +126,7 @@ define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -145,29 +137,18 @@ define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}
define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
-; SDAG-LABEL: add_max_v2u16_ssv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_ssv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s2, s0, 16
-; GISEL-NEXT: s_lshr_b32 s3, s1, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_add_co_i32 s2, s2, s3
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+; GCN-LABEL: add_max_v2u16_ssv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -176,30 +157,18 @@ define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b
define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
; SDAG-LABEL: add_max_v2u16_sss:
; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_u16 v0, s0, s1
+; SDAG-NEXT: v_pk_add_u16 v0, s0, s1 clamp
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-NEXT: v_pk_max_u16 v0, v0, s2
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: add_max_v2u16_sss:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s3, s0, 16
-; GISEL-NEXT: s_lshr_b32 s4, s1, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_add_co_i32 s3, s3, s4
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3
-; GISEL-NEXT: s_and_b32 s3, s2, 0xffff
-; GISEL-NEXT: s_lshr_b32 s1, s0, 16
-; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT: s_lshr_b32 s2, s2, 16
-; GISEL-NEXT: s_max_u32 s0, s0, s3
-; GISEL-NEXT: s_max_u32 s1, s1, s2
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -210,7 +179,7 @@ define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -221,28 +190,18 @@ define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}
define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
-; SDAG-LABEL: add_max_v2u16_slv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_slv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s1, s0, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064
-; GISEL-NEXT: s_addk_co_i32 s1, 0x64
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, <i16 100, i16 100>
+; GCN-LABEL: add_max_v2u16_slv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> <i16 100, i16 100>)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %b)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -253,7 +212,7 @@ define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -264,7 +223,7 @@ define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -275,7 +234,7 @@ define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
index b72eba8..8088c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
; CHECK-LABEL: s_add64_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s2, s4, 0
; CHECK-NEXT: ; return to shader part epilog
%sum64 = add i64 %val64A, %val64B
@@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s10, s2, s6
-; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT: s_addc_u32 s8, s3, s7
+; CHECK-NEXT: s_add_u32 s6, s2, s6
+; CHECK-NEXT: s_addc_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_add_u32 s0, s0, s4
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s10
-; CHECK-NEXT: v_mov_b32_e32 v5, s8
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sub_u32 s10, s2, s6
-; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT: s_subb_u32 s8, s3, s7
+; CHECK-NEXT: s_sub_u32 s6, s2, s6
+; CHECK-NEXT: s_subb_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_sub_u32 s0, s0, s4
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_subb_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s10
-; CHECK-NEXT: v_mov_b32_e32 v5, s8
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
; CHECK-LABEL: s_uadd_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
-; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, -1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, -1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
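
All the deletions in this file follow one rule: s_add_u32/s_sub_u32 already leave the carry or borrow in SCC, so when the next consumer is s_addc_u32/s_subb_u32, the s_cselect_b64 + s_cmp_lg_u64 pair that saved and re-materialized SCC was dead weight. A sketch of the underlying IR idiom (names illustrative):

define i32 @carry_into_high_sketch(i64 %a, i64 %b, i32 %hi) {
  ; 64-bit add with carry-out; the carry feeds a third 32-bit word,
  ; mirroring the s_add64_32 test above
  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %carry = extractvalue { i64, i1 } %pair, 1
  %c32 = zext i1 %carry to i32
  %hi.out = add i32 %hi, %c32
  ret i32 %hi.out
}

declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
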
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 948811e..51df8c3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7821,10 +7821,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s0, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -7855,7 +7854,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s15, s16, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s12
; GFX6-NEXT: s_ashr_i32 s12, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s12
@@ -7881,52 +7879,50 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
; GFX6-NEXT: s_mul_i32 s14, s7, s14
-; GFX6-NEXT: s_add_u32 s14, s1, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: s_add_u32 s16, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_addc_u32 s15, 0, s4
+; GFX6-NEXT: s_addc_u32 s17, 0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_mul_i32 s4, s10, s15
+; GFX6-NEXT: s_mul_i32 s4, s10, s17
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
-; GFX6-NEXT: s_mul_i32 s5, s11, s14
-; GFX6-NEXT: s_add_i32 s16, s4, s5
-; GFX6-NEXT: s_sub_i32 s17, s7, s16
-; GFX6-NEXT: s_mul_i32 s4, s10, s14
+; GFX6-NEXT: s_mul_i32 s5, s11, s16
+; GFX6-NEXT: s_add_i32 s18, s4, s5
+; GFX6-NEXT: s_sub_i32 s14, s7, s18
+; GFX6-NEXT: s_mul_i32 s4, s10, s16
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s18, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s17, s17, s11
-; GFX6-NEXT: s_sub_u32 s19, s6, s10
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s4, s5
+; GFX6-NEXT: s_subb_u32 s19, s14, s11
+; GFX6-NEXT: s_sub_u32 s20, s6, s10
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s20, s10
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s14, s19, s15
+; GFX6-NEXT: s_add_u32 s15, s16, 1
+; GFX6-NEXT: s_addc_u32 s19, s17, 0
+; GFX6-NEXT: s_add_u32 s20, s16, 2
+; GFX6-NEXT: s_addc_u32 s21, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s20, s15
+; GFX6-NEXT: s_cselect_b32 s15, s21, s19
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s17, 0
+; GFX6-NEXT: s_subb_u32 s4, s7, s18
; GFX6-NEXT: s_cmp_ge_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s19, s10
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, s11
-; GFX6-NEXT: s_cselect_b32 s4, s17, s5
-; GFX6-NEXT: s_add_u32 s5, s14, 1
-; GFX6-NEXT: s_addc_u32 s17, s15, 0
-; GFX6-NEXT: s_add_u32 s19, s14, 2
-; GFX6-NEXT: s_addc_u32 s20, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_cselect_b32 s4, s19, s5
-; GFX6-NEXT: s_cselect_b32 s5, s20, s17
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s7, s7, s16
-; GFX6-NEXT: s_cmp_ge_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s16, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s10
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s6, s6, s16
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s5, s5, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s14
+; GFX6-NEXT: s_cmp_eq_u32 s4, s11
+; GFX6-NEXT: s_cselect_b32 s4, s6, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s5, s15, s17
+; GFX6-NEXT: s_cselect_b32 s4, s14, s16
; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_sub_u32 s4, s4, s6
@@ -7949,8 +7945,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s10, 0, s8
-; GFX9-NEXT: s_subb_u32 s11, 0, s9
+; GFX9-NEXT: s_sub_u32 s4, 0, s8
+; GFX9-NEXT: s_subb_u32 s5, 0, s9
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -7960,56 +7956,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4
-; GFX9-NEXT: s_mul_i32 s13, s11, s4
-; GFX9-NEXT: s_add_i32 s5, s14, s5
-; GFX9-NEXT: s_mul_i32 s15, s10, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15
-; GFX9-NEXT: s_mul_i32 s16, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-NEXT: v_readfirstlane_b32 s11, v1
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11
+; GFX9-NEXT: s_mul_i32 s13, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s14, s12
+; GFX9-NEXT: s_mul_i32 s15, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s13
+; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
; GFX9-NEXT: s_add_u32 s14, s14, s16
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT: s_mul_i32 s15, s10, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5
+; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s5, s12, s5
-; GFX9-NEXT: s_add_u32 s5, s13, s5
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s12, s13, s12
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s4, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s11, s11, s14
-; GFX9-NEXT: s_add_i32 s4, s4, s11
-; GFX9-NEXT: s_mul_i32 s10, s10, s14
-; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s12, s10
-; GFX9-NEXT: s_mul_i32 s16, s14, s4
-; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_add_u32 s11, s11, s12
+; GFX9-NEXT: s_addc_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s5, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s11
+; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4
+; GFX9-NEXT: s_mul_i32 s14, s10, s4
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4
+; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12
+; GFX9-NEXT: s_addc_u32 s4, s15, s13
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s10, s4
-; GFX9-NEXT: s_addc_u32 s10, 0, s5
-; GFX9-NEXT: s_add_u32 s11, s14, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s11, s11, s4
+; GFX9-NEXT: s_addc_u32 s10, s10, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -8028,38 +8020,35 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_addc_u32 s11, s12, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
; GFX9-NEXT: s_mul_i32 s10, s3, s10
-; GFX9-NEXT: s_add_u32 s14, s11, s10
-; GFX9-NEXT: s_addc_u32 s15, 0, s12
-; GFX9-NEXT: s_mul_i32 s10, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14
+; GFX9-NEXT: s_add_u32 s13, s11, s10
+; GFX9-NEXT: s_addc_u32 s12, 0, s12
+; GFX9-NEXT: s_mul_i32 s10, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s9, s14
-; GFX9-NEXT: s_add_i32 s16, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s3, s16
-; GFX9-NEXT: s_mul_i32 s10, s8, s14
+; GFX9-NEXT: s_mul_i32 s11, s9, s13
+; GFX9-NEXT: s_add_i32 s14, s10, s11
+; GFX9-NEXT: s_sub_i32 s15, s3, s14
+; GFX9-NEXT: s_mul_i32 s10, s8, s13
; GFX9-NEXT: s_sub_u32 s2, s2, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s17, s12, s9
-; GFX9-NEXT: s_sub_u32 s18, s2, s8
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s17, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s18, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, s9
+; GFX9-NEXT: s_sub_u32 s16, s2, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, 0
+; GFX9-NEXT: s_cmp_ge_u32 s15, s9
; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s12, s17, s13
-; GFX9-NEXT: s_add_u32 s13, s14, 1
-; GFX9-NEXT: s_addc_u32 s17, s15, 0
-; GFX9-NEXT: s_add_u32 s18, s14, 2
-; GFX9-NEXT: s_addc_u32 s19, s15, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s18, s13
-; GFX9-NEXT: s_cselect_b32 s13, s19, s17
+; GFX9-NEXT: s_cmp_ge_u32 s16, s8
+; GFX9-NEXT: s_cselect_b32 s16, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s15, s9
+; GFX9-NEXT: s_cselect_b32 s15, s16, s17
+; GFX9-NEXT: s_add_u32 s16, s13, 1
+; GFX9-NEXT: s_addc_u32 s17, s12, 0
+; GFX9-NEXT: s_add_u32 s18, s13, 2
+; GFX9-NEXT: s_addc_u32 s19, s12, 0
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_cselect_b32 s15, s18, s16
+; GFX9-NEXT: s_cselect_b32 s16, s19, s17
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s16
+; GFX9-NEXT: s_subb_u32 s3, s3, s14
; GFX9-NEXT: s_cmp_ge_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s2, s8
@@ -8067,8 +8056,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s2, s2, s10
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s3, s13, s15
-; GFX9-NEXT: s_cselect_b32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b32 s3, s16, s12
+; GFX9-NEXT: s_cselect_b32 s2, s15, s13
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_sub_u32 s2, s2, s4
@@ -8328,10 +8317,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s17, 0, s18
; GFX6-NEXT: s_add_u32 s18, s12, s13
; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s16, s16, s17
; GFX6-NEXT: s_mul_i32 s12, s14, s16
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
@@ -8362,7 +8350,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s15, s18, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s14, s16, s14
; GFX6-NEXT: s_ashr_i32 s12, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s12
@@ -8387,55 +8374,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_addc_u32 s16, s16, 0
; GFX6-NEXT: s_mul_i32 s14, s9, s14
-; GFX6-NEXT: s_add_u32 s17, s15, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s17
+; GFX6-NEXT: s_add_u32 s18, s15, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
-; GFX6-NEXT: s_mul_i32 s14, s6, s16
+; GFX6-NEXT: s_addc_u32 s19, 0, s16
+; GFX6-NEXT: s_mul_i32 s14, s6, s19
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
; GFX6-NEXT: s_add_i32 s14, s15, s14
-; GFX6-NEXT: s_mul_i32 s15, s7, s17
-; GFX6-NEXT: s_add_i32 s18, s14, s15
-; GFX6-NEXT: s_sub_i32 s19, s9, s18
-; GFX6-NEXT: s_mul_i32 s14, s6, s17
+; GFX6-NEXT: s_mul_i32 s15, s7, s18
+; GFX6-NEXT: s_add_i32 s20, s14, s15
+; GFX6-NEXT: s_sub_i32 s16, s9, s20
+; GFX6-NEXT: s_mul_i32 s14, s6, s18
; GFX6-NEXT: s_sub_u32 s8, s8, s14
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s7
-; GFX6-NEXT: s_sub_u32 s21, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s14, s15
+; GFX6-NEXT: s_subb_u32 s21, s16, s7
+; GFX6-NEXT: s_sub_u32 s22, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GFX6-NEXT: s_or_b32 s16, s16, s17
+; GFX6-NEXT: s_subb_u32 s16, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s6
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s16, s21, s17
+; GFX6-NEXT: s_add_u32 s17, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_cselect_b32 s16, s22, s17
+; GFX6-NEXT: s_cselect_b32 s17, s23, s21
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_subb_u32 s14, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s6
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s14, s19, s15
-; GFX6-NEXT: s_add_u32 s15, s17, 1
-; GFX6-NEXT: s_addc_u32 s19, s16, 0
-; GFX6-NEXT: s_add_u32 s21, s17, 2
-; GFX6-NEXT: s_addc_u32 s22, s16, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s21, s15
-; GFX6-NEXT: s_cselect_b32 s15, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s18
+; GFX6-NEXT: s_subb_u32 s9, s9, s20
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s18
+; GFX6-NEXT: s_cselect_b32 s6, s6, s14
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s15, s16
-; GFX6-NEXT: s_cselect_b32 s6, s14, s17
+; GFX6-NEXT: s_cselect_b32 s7, s17, s19
+; GFX6-NEXT: s_cselect_b32 s6, s16, s18
; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT: s_sub_u32 s14, s6, s2
-; GFX6-NEXT: s_subb_u32 s15, s7, s3
+; GFX6-NEXT: s_sub_u32 s16, s6, s2
+; GFX6-NEXT: s_subb_u32 s17, s7, s3
; GFX6-NEXT: s_ashr_i32 s6, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -8454,40 +8439,39 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s16
+; GFX6-NEXT: s_mul_i32 s1, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s13, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s17, s12, s2
+; GFX6-NEXT: s_mul_i32 s15, s12, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s18, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s17, s16, s17
+; GFX6-NEXT: s_mul_i32 s15, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s17
+; GFX6-NEXT: s_add_u32 s4, s4, s15
; GFX6-NEXT: s_addc_u32 s4, s5, s18
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s16, s3
+; GFX6-NEXT: s_mul_i32 s3, s14, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s4, s16, s4
+; GFX6-NEXT: s_addc_u32 s4, s14, s4
; GFX6-NEXT: s_mul_i32 s2, s12, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -8501,14 +8485,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s13, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_add_u32 s13, s17, s13
-; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: v_readfirstlane_b32 s15, v2
+; GFX6-NEXT: s_add_u32 s13, s15, s13
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: v_readfirstlane_b32 s12, v3
; GFX6-NEXT: s_add_u32 s3, s13, s3
-; GFX6-NEXT: s_addc_u32 s3, s16, s12
+; GFX6-NEXT: s_addc_u32 s3, s14, s12
; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_addc_u32 s12, s12, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
@@ -8517,7 +8501,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_addc_u32 s12, s4, s12
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
@@ -8529,72 +8512,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mov_b32_e32 v2, s13
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
; GFX6-NEXT: s_mul_i32 s2, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v3
+; GFX6-NEXT: v_readfirstlane_b32 s15, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT: s_add_u32 s2, s17, s2
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_add_u32 s2, s15, s2
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: s_mul_i32 s13, s11, s13
-; GFX6-NEXT: v_readfirstlane_b32 s17, v1
+; GFX6-NEXT: v_readfirstlane_b32 s15, v1
; GFX6-NEXT: s_add_u32 s2, s2, s13
-; GFX6-NEXT: s_addc_u32 s2, s16, s17
+; GFX6-NEXT: s_addc_u32 s2, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_addc_u32 s13, s13, 0
; GFX6-NEXT: s_mul_i32 s12, s11, s12
-; GFX6-NEXT: s_add_u32 s16, s2, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_add_u32 s18, s2, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s13
-; GFX6-NEXT: s_mul_i32 s12, s8, s17
+; GFX6-NEXT: s_addc_u32 s19, 0, s13
+; GFX6-NEXT: s_mul_i32 s12, s8, s19
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s13, s9, s16
-; GFX6-NEXT: s_add_i32 s18, s12, s13
-; GFX6-NEXT: s_sub_i32 s19, s11, s18
-; GFX6-NEXT: s_mul_i32 s12, s8, s16
+; GFX6-NEXT: s_mul_i32 s13, s9, s18
+; GFX6-NEXT: s_add_i32 s20, s12, s13
+; GFX6-NEXT: s_sub_i32 s14, s11, s20
+; GFX6-NEXT: s_mul_i32 s12, s8, s18
; GFX6-NEXT: s_sub_u32 s10, s10, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s9
-; GFX6-NEXT: s_sub_u32 s21, s10, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s12, s13
+; GFX6-NEXT: s_subb_u32 s21, s14, s9
+; GFX6-NEXT: s_sub_u32 s22, s10, s8
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s8
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s14, s21, s15
+; GFX6-NEXT: s_add_u32 s15, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s22, s15
+; GFX6-NEXT: s_cselect_b32 s15, s23, s21
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s12, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s13, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s8
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s12, s19, s13
-; GFX6-NEXT: s_add_u32 s13, s16, 1
-; GFX6-NEXT: s_addc_u32 s19, s17, 0
-; GFX6-NEXT: s_add_u32 s21, s16, 2
-; GFX6-NEXT: s_addc_u32 s22, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_cselect_b32 s12, s21, s13
-; GFX6-NEXT: s_cselect_b32 s13, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s11, s11, s18
+; GFX6-NEXT: s_subb_u32 s11, s11, s20
; GFX6-NEXT: s_cmp_ge_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s10, s8
; GFX6-NEXT: s_cselect_b32 s8, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s8, s8, s18
+; GFX6-NEXT: s_cselect_b32 s8, s8, s12
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_cselect_b32 s9, s13, s17
-; GFX6-NEXT: s_cselect_b32 s8, s12, s16
+; GFX6-NEXT: s_cselect_b32 s9, s15, s19
+; GFX6-NEXT: s_cselect_b32 s8, s14, s18
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
; GFX6-NEXT: s_sub_u32 s4, s6, s4
; GFX6-NEXT: s_subb_u32 s5, s7, s5
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mov_b32_e32 v1, s17
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -8614,8 +8595,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s14, 0, s6
-; GFX9-NEXT: s_subb_u32 s15, 0, s7
+; GFX9-NEXT: s_sub_u32 s12, 0, s6
+; GFX9-NEXT: s_subb_u32 s13, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8624,56 +8605,52 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s16, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: s_mul_i32 s13, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12
-; GFX9-NEXT: s_mul_i32 s17, s15, s12
-; GFX9-NEXT: s_add_i32 s13, s18, s13
-; GFX9-NEXT: s_mul_i32 s19, s14, s12
-; GFX9-NEXT: s_add_i32 s13, s13, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19
-; GFX9-NEXT: s_mul_i32 s20, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13
+; GFX9-NEXT: v_readfirstlane_b32 s14, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v0
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15
+; GFX9-NEXT: s_mul_i32 s17, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s18, s16
+; GFX9-NEXT: s_mul_i32 s19, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16
; GFX9-NEXT: s_add_u32 s18, s18, s20
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19
-; GFX9-NEXT: s_mul_i32 s19, s16, s19
+; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19
+; GFX9-NEXT: s_mul_i32 s19, s14, s19
; GFX9-NEXT: s_add_u32 s18, s18, s19
-; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13
+; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16
; GFX9-NEXT: s_addc_u32 s17, s17, s20
; GFX9-NEXT: s_addc_u32 s18, s21, 0
-; GFX9-NEXT: s_mul_i32 s13, s16, s13
-; GFX9-NEXT: s_add_u32 s13, s17, s13
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s16, s17, s16
; GFX9-NEXT: s_addc_u32 s17, 0, s18
-; GFX9-NEXT: s_add_u32 s18, s12, s13
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_i32 s12, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18
-; GFX9-NEXT: s_add_i32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s15, s15, s18
-; GFX9-NEXT: s_add_i32 s12, s12, s15
-; GFX9-NEXT: s_mul_i32 s14, s14, s18
-; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14
-; GFX9-NEXT: s_mul_i32 s17, s16, s14
-; GFX9-NEXT: s_mul_i32 s20, s18, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14
-; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12
-; GFX9-NEXT: s_add_u32 s14, s14, s20
+; GFX9-NEXT: s_add_u32 s15, s15, s16
+; GFX9-NEXT: s_addc_u32 s14, s14, s17
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s17, s16
+; GFX9-NEXT: s_mul_i32 s13, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s13
+; GFX9-NEXT: s_mul_i32 s12, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12
+; GFX9-NEXT: s_mul_i32 s18, s14, s12
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s20
; GFX9-NEXT: s_addc_u32 s19, 0, s19
-; GFX9-NEXT: s_add_u32 s14, s14, s17
-; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12
-; GFX9-NEXT: s_addc_u32 s14, s19, s15
+; GFX9-NEXT: s_add_u32 s12, s12, s18
+; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16
+; GFX9-NEXT: s_addc_u32 s12, s19, s17
; GFX9-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NEXT: s_mul_i32 s12, s16, s12
-; GFX9-NEXT: s_add_u32 s12, s14, s12
-; GFX9-NEXT: s_addc_u32 s14, 0, s13
-; GFX9-NEXT: s_add_u32 s15, s18, s12
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s14, s16, s14
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s16
+; GFX9-NEXT: s_addc_u32 s13, 0, s13
+; GFX9-NEXT: s_add_u32 s15, s15, s12
+; GFX9-NEXT: s_addc_u32 s14, s14, s13
; GFX9-NEXT: s_ashr_i32 s12, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_mov_b32 s13, s12
@@ -8691,38 +8668,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_addc_u32 s15, s16, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s18, s15, s14
-; GFX9-NEXT: s_addc_u32 s19, 0, s16
-; GFX9-NEXT: s_mul_i32 s14, s6, s19
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18
+; GFX9-NEXT: s_add_u32 s17, s15, s14
+; GFX9-NEXT: s_addc_u32 s16, 0, s16
+; GFX9-NEXT: s_mul_i32 s14, s6, s16
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17
; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s15, s7, s18
-; GFX9-NEXT: s_add_i32 s20, s14, s15
-; GFX9-NEXT: s_sub_i32 s16, s9, s20
-; GFX9-NEXT: s_mul_i32 s14, s6, s18
+; GFX9-NEXT: s_mul_i32 s15, s7, s17
+; GFX9-NEXT: s_add_i32 s18, s14, s15
+; GFX9-NEXT: s_sub_i32 s19, s9, s18
+; GFX9-NEXT: s_mul_i32 s14, s6, s17
; GFX9-NEXT: s_sub_u32 s8, s8, s14
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s21, s16, s7
-; GFX9-NEXT: s_sub_u32 s22, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GFX9-NEXT: s_subb_u32 s16, s21, 0
-; GFX9-NEXT: s_cmp_ge_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s22, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, s7
+; GFX9-NEXT: s_sub_u32 s20, s8, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, 0
+; GFX9-NEXT: s_cmp_ge_u32 s19, s7
; GFX9-NEXT: s_cselect_b32 s21, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s16, s21, s17
-; GFX9-NEXT: s_add_u32 s17, s18, 1
-; GFX9-NEXT: s_addc_u32 s21, s19, 0
-; GFX9-NEXT: s_add_u32 s22, s18, 2
-; GFX9-NEXT: s_addc_u32 s23, s19, 0
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s16, s22, s17
-; GFX9-NEXT: s_cselect_b32 s17, s23, s21
+; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_cselect_b32 s20, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s19, s7
+; GFX9-NEXT: s_cselect_b32 s19, s20, s21
+; GFX9-NEXT: s_add_u32 s20, s17, 1
+; GFX9-NEXT: s_addc_u32 s21, s16, 0
+; GFX9-NEXT: s_add_u32 s22, s17, 2
+; GFX9-NEXT: s_addc_u32 s23, s16, 0
+; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_cselect_b32 s19, s22, s20
+; GFX9-NEXT: s_cselect_b32 s20, s23, s21
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s20
+; GFX9-NEXT: s_subb_u32 s9, s9, s18
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s14, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8730,12 +8704,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s14
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s17, s19
-; GFX9-NEXT: s_cselect_b32 s6, s16, s18
+; GFX9-NEXT: s_cselect_b32 s7, s20, s16
+; GFX9-NEXT: s_cselect_b32 s6, s19, s17
; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX9-NEXT: s_sub_u32 s14, s6, s2
-; GFX9-NEXT: s_subb_u32 s15, s7, s3
+; GFX9-NEXT: s_sub_u32 s12, s6, s2
+; GFX9-NEXT: s_subb_u32 s13, s7, s3
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_add_u32 s0, s0, s2
; GFX9-NEXT: s_mov_b32 s3, s2
@@ -8744,8 +8718,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -8755,105 +8729,98 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: v_readfirstlane_b32 s13, v2
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s8, s13
-; GFX9-NEXT: s_mul_i32 s5, s9, s4
-; GFX9-NEXT: s_add_i32 s12, s12, s16
-; GFX9-NEXT: s_add_i32 s12, s12, s5
-; GFX9-NEXT: s_mul_i32 s17, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s4, s12
-; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17
-; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v2
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s4, s15
+; GFX9-NEXT: s_mul_i32 s9, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s16
+; GFX9-NEXT: s_add_i32 s14, s14, s9
+; GFX9-NEXT: s_mul_i32 s17, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17
+; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14
; GFX9-NEXT: s_add_u32 s16, s18, s16
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17
-; GFX9-NEXT: s_mul_i32 s17, s13, s17
+; GFX9-NEXT: s_addc_u32 s9, 0, s9
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17
+; GFX9-NEXT: s_mul_i32 s17, s15, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12
-; GFX9-NEXT: s_addc_u32 s5, s5, s19
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14
+; GFX9-NEXT: s_addc_u32 s9, s9, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
-; GFX9-NEXT: s_mul_i32 s12, s13, s12
-; GFX9-NEXT: s_add_u32 s5, s5, s12
-; GFX9-NEXT: s_addc_u32 s12, 0, s16
-; GFX9-NEXT: s_add_u32 s16, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s4, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s16
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s16
-; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT: s_mul_i32 s13, s12, s8
-; GFX9-NEXT: s_mul_i32 s18, s16, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8
-; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s18
+; GFX9-NEXT: s_mul_i32 s14, s15, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s14
+; GFX9-NEXT: s_addc_u32 s14, 0, s16
+; GFX9-NEXT: s_add_u32 s8, s8, s9
+; GFX9-NEXT: s_addc_u32 s9, s15, s14
+; GFX9-NEXT: s_mul_i32 s14, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8
+; GFX9-NEXT: s_add_i32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s5, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4
+; GFX9-NEXT: s_mul_i32 s16, s9, s4
+; GFX9-NEXT: s_mul_i32 s18, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
+; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s8, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s8, s17, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s16
+; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14
+; GFX9-NEXT: s_addc_u32 s4, s17, s15
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s8, s4
-; GFX9-NEXT: s_addc_u32 s8, 0, s5
-; GFX9-NEXT: s_add_u32 s13, s16, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s8
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s14, s8, s4
+; GFX9-NEXT: s_addc_u32 s15, s9, s5
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s8, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s9, s11, s4
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX9-NEXT: s_mul_i32 s11, s8, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15
; GFX9-NEXT: s_add_u32 s11, s16, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13
-; GFX9-NEXT: s_mul_i32 s13, s9, s13
-; GFX9-NEXT: s_add_u32 s11, s11, s13
-; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12
+; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s11, s11, s14
+; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15
; GFX9-NEXT: s_addc_u32 s10, s10, s17
; GFX9-NEXT: s_addc_u32 s11, s16, 0
-; GFX9-NEXT: s_mul_i32 s12, s9, s12
-; GFX9-NEXT: s_add_u32 s16, s10, s12
-; GFX9-NEXT: s_addc_u32 s17, 0, s11
-; GFX9-NEXT: s_mul_i32 s10, s6, s17
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16
+; GFX9-NEXT: s_mul_i32 s14, s9, s15
+; GFX9-NEXT: s_add_u32 s14, s10, s14
+; GFX9-NEXT: s_addc_u32 s15, 0, s11
+; GFX9-NEXT: s_mul_i32 s10, s6, s15
+; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s7, s16
-; GFX9-NEXT: s_add_i32 s18, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s9, s18
-; GFX9-NEXT: s_mul_i32 s10, s6, s16
+; GFX9-NEXT: s_mul_i32 s11, s7, s14
+; GFX9-NEXT: s_add_i32 s16, s10, s11
+; GFX9-NEXT: s_sub_i32 s17, s9, s16
+; GFX9-NEXT: s_mul_i32 s10, s6, s14
; GFX9-NEXT: s_sub_u32 s8, s8, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s19, s12, s7
-; GFX9-NEXT: s_sub_u32 s20, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s19, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, s7
+; GFX9-NEXT: s_sub_u32 s18, s8, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, 0
+; GFX9-NEXT: s_cmp_ge_u32 s17, s7
; GFX9-NEXT: s_cselect_b32 s19, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s12, s19, s13
-; GFX9-NEXT: s_add_u32 s13, s16, 1
-; GFX9-NEXT: s_addc_u32 s19, s17, 0
-; GFX9-NEXT: s_add_u32 s20, s16, 2
-; GFX9-NEXT: s_addc_u32 s21, s17, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s20, s13
-; GFX9-NEXT: s_cselect_b32 s13, s21, s19
+; GFX9-NEXT: s_cmp_ge_u32 s18, s6
+; GFX9-NEXT: s_cselect_b32 s18, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s17, s7
+; GFX9-NEXT: s_cselect_b32 s17, s18, s19
+; GFX9-NEXT: s_add_u32 s18, s14, 1
+; GFX9-NEXT: s_addc_u32 s19, s15, 0
+; GFX9-NEXT: s_add_u32 s20, s14, 2
+; GFX9-NEXT: s_addc_u32 s21, s15, 0
+; GFX9-NEXT: s_cmp_lg_u32 s17, 0
+; GFX9-NEXT: s_cselect_b32 s17, s20, s18
+; GFX9-NEXT: s_cselect_b32 s18, s21, s19
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s18
+; GFX9-NEXT: s_subb_u32 s9, s9, s16
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8861,14 +8828,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s10
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s13, s17
-; GFX9-NEXT: s_cselect_b32 s6, s12, s16
+; GFX9-NEXT: s_cselect_b32 s7, s18, s15
+; GFX9-NEXT: s_cselect_b32 s6, s17, s14
; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
; GFX9-NEXT: s_sub_u32 s2, s4, s2
; GFX9-NEXT: s_subb_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -9089,10 +9056,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s13, 0, s14
; GFX6-NEXT: s_add_u32 s14, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s13
; GFX6-NEXT: s_mul_i32 s0, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -9123,7 +9089,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s13, s14, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s10
; GFX6-NEXT: s_ashr_i32 s10, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s10
@@ -9158,46 +9123,43 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
; GFX6-NEXT: s_mul_i32 s5, s9, s12
-; GFX6-NEXT: s_add_i32 s13, s4, s5
-; GFX6-NEXT: s_sub_i32 s14, s7, s13
+; GFX6-NEXT: s_add_i32 s14, s4, s5
+; GFX6-NEXT: s_sub_i32 s13, s7, s14
; GFX6-NEXT: s_mul_i32 s4, s8, s12
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s12, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s15, s6, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_subb_u32 s15, s13, s9
+; GFX6-NEXT: s_sub_u32 s16, s6, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s15, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s8
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, s19, s18
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s15, s15, s9
+; GFX6-NEXT: s_sub_u32 s19, s16, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s12, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_cselect_b32 s13, s19, s16
+; GFX6-NEXT: s_cselect_b32 s12, s12, s17
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s16, s14, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s9
+; GFX6-NEXT: s_subb_u32 s4, s7, s14
+; GFX6-NEXT: s_cmp_ge_u32 s4, s9
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s15, s8
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, s9
-; GFX6-NEXT: s_cselect_b32 s17, s17, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s18, s15, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s14, 0
-; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b32 s14, s18, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s16
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s5, s7, s13
-; GFX6-NEXT: s_cmp_ge_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s8
-; GFX6-NEXT: s_cselect_b32 s8, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, s8, s7
-; GFX6-NEXT: s_cmp_lg_u32 s7, 0
-; GFX6-NEXT: s_cselect_b32 s5, s4, s5
-; GFX6-NEXT: s_cselect_b32 s4, s14, s6
+; GFX6-NEXT: s_cselect_b32 s7, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, s9
+; GFX6-NEXT: s_cselect_b32 s5, s7, s5
+; GFX6-NEXT: s_cmp_lg_u32 s5, 0
+; GFX6-NEXT: s_cselect_b32 s5, s12, s4
+; GFX6-NEXT: s_cselect_b32 s4, s13, s6
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
; GFX6-NEXT: s_sub_u32 s4, s4, s10
; GFX6-NEXT: s_subb_u32 s5, s5, s10
@@ -9219,8 +9181,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9230,56 +9192,52 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s10, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s11, s9, s4
-; GFX9-NEXT: s_add_i32 s5, s12, s5
-; GFX9-NEXT: s_mul_i32 s13, s8, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13
-; GFX9-NEXT: s_mul_i32 s14, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9
+; GFX9-NEXT: s_mul_i32 s11, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s11
+; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10
; GFX9-NEXT: s_add_u32 s12, s12, s14
; GFX9-NEXT: s_addc_u32 s11, 0, s11
-; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13
-; GFX9-NEXT: s_mul_i32 s13, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13
+; GFX9-NEXT: s_mul_i32 s13, s8, s13
; GFX9-NEXT: s_add_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5
+; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10
; GFX9-NEXT: s_addc_u32 s11, s11, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
-; GFX9-NEXT: s_mul_i32 s5, s10, s5
-; GFX9-NEXT: s_add_u32 s5, s11, s5
+; GFX9-NEXT: s_mul_i32 s10, s8, s10
+; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_addc_u32 s11, 0, s12
-; GFX9-NEXT: s_add_u32 s12, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s10, s11
-; GFX9-NEXT: s_mul_i32 s4, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s12
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8
-; GFX9-NEXT: s_mul_i32 s11, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s12, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8
-; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s10
+; GFX9-NEXT: s_addc_u32 s8, s8, s11
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s11, s10
+; GFX9-NEXT: s_mul_i32 s5, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4
+; GFX9-NEXT: s_mul_i32 s12, s8, s4
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4
+; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10
+; GFX9-NEXT: s_add_u32 s4, s4, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_add_u32 s8, s8, s11
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4
-; GFX9-NEXT: s_addc_u32 s8, s13, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10
+; GFX9-NEXT: s_addc_u32 s4, s13, s11
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s10, s4
-; GFX9-NEXT: s_add_u32 s4, s8, s4
-; GFX9-NEXT: s_addc_u32 s8, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s12, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s10, s8
+; GFX9-NEXT: s_mul_i32 s10, s8, s10
+; GFX9-NEXT: s_add_u32 s4, s4, s10
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s9, s9, s4
+; GFX9-NEXT: s_addc_u32 s8, s8, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -9309,11 +9267,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_mul_i32 s8, s6, s8
; GFX9-NEXT: s_sub_u32 s2, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s13, s10, s7
; GFX9-NEXT: s_sub_u32 s14, s2, s6
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s15, s13, 0
; GFX9-NEXT: s_cmp_ge_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
@@ -9322,13 +9278,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, s17, s16
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s13, s13, s7
-; GFX9-NEXT: s_sub_u32 s17, s14, s6
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s13, 0
+; GFX9-NEXT: s_subb_u32 s10, s13, s7
+; GFX9-NEXT: s_sub_u32 s11, s14, s6
+; GFX9-NEXT: s_subb_u32 s10, s10, 0
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s11, s17, s14
+; GFX9-NEXT: s_cselect_b32 s11, s11, s14
; GFX9-NEXT: s_cselect_b32 s10, s10, s15
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s3, s3, s12
@@ -9490,10 +9444,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s6, s7
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s6, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s7, v0
@@ -9524,7 +9477,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s16, s6
; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s12, s14, s12
; GFX6-NEXT: s_ashr_i32 s6, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s6
@@ -9557,49 +9509,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_add_i32 s13, s14, s13
; GFX6-NEXT: s_mul_i32 s14, s3, s12
-; GFX6-NEXT: s_add_i32 s14, s13, s14
-; GFX6-NEXT: s_sub_i32 s15, s9, s14
+; GFX6-NEXT: s_add_i32 s16, s13, s14
+; GFX6-NEXT: s_sub_i32 s14, s9, s16
; GFX6-NEXT: s_mul_i32 s12, s2, s12
; GFX6-NEXT: s_sub_u32 s8, s8, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s3
-; GFX6-NEXT: s_sub_u32 s17, s8, s2
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s18, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s3
-; GFX6-NEXT: s_cselect_b32 s13, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s2
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, s3
-; GFX6-NEXT: s_cselect_b32 s19, s19, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s3
-; GFX6-NEXT: s_sub_u32 s20, s17, s2
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s14, s3
+; GFX6-NEXT: s_sub_u32 s18, s8, s2
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s19, s14, s15
+; GFX6-NEXT: s_subb_u32 s19, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s3
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s2
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s3
+; GFX6-NEXT: s_cselect_b32 s20, s21, s20
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s17, s17, s3
+; GFX6-NEXT: s_sub_u32 s21, s18, s2
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_cselect_b32 s15, s21, s18
+; GFX6-NEXT: s_cselect_b32 s14, s14, s19
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s12, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b32 s13, s20, s17
-; GFX6-NEXT: s_cselect_b32 s12, s12, s18
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s14
+; GFX6-NEXT: s_subb_u32 s9, s9, s16
; GFX6-NEXT: s_cmp_ge_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s2
; GFX6-NEXT: s_cselect_b32 s2, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s2, s2, s14
+; GFX6-NEXT: s_cselect_b32 s2, s2, s12
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_cselect_b32 s3, s12, s9
-; GFX6-NEXT: s_cselect_b32 s2, s13, s8
+; GFX6-NEXT: s_cselect_b32 s3, s14, s9
+; GFX6-NEXT: s_cselect_b32 s2, s15, s8
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
-; GFX6-NEXT: s_sub_u32 s12, s2, s6
-; GFX6-NEXT: s_subb_u32 s13, s3, s6
+; GFX6-NEXT: s_sub_u32 s14, s2, s6
+; GFX6-NEXT: s_subb_u32 s15, s3, s6
; GFX6-NEXT: s_ashr_i32 s2, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s2
; GFX6-NEXT: s_mov_b32 s3, s2
@@ -9618,40 +9567,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s8, s14
+; GFX6-NEXT: s_mul_i32 s1, s8, s12
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s9, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s15, s8, s2
+; GFX6-NEXT: s_mul_i32 s13, s8, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s16, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s15, s14, s15
+; GFX6-NEXT: s_mul_i32 s13, s12, s13
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s15
+; GFX6-NEXT: s_add_u32 s4, s4, s13
; GFX6-NEXT: s_addc_u32 s4, s5, s16
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s14, s3
+; GFX6-NEXT: s_mul_i32 s3, s12, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s4, s14, s4
+; GFX6-NEXT: s_addc_u32 s4, s12, s4
; GFX6-NEXT: s_mul_i32 s2, s8, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -9665,102 +9613,98 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s9, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_add_u32 s9, s15, s9
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: v_readfirstlane_b32 s13, v2
+; GFX6-NEXT: s_add_u32 s9, s13, s9
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: s_addc_u32 s12, 0, s12
; GFX6-NEXT: v_readfirstlane_b32 s8, v3
; GFX6-NEXT: s_add_u32 s3, s9, s3
-; GFX6-NEXT: s_addc_u32 s3, s14, s8
+; GFX6-NEXT: s_addc_u32 s3, s12, s8
; GFX6-NEXT: v_readfirstlane_b32 s8, v1
; GFX6-NEXT: s_addc_u32 s8, s8, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
; GFX6-NEXT: s_add_u32 s2, s3, s2
; GFX6-NEXT: s_addc_u32 s8, 0, s8
-; GFX6-NEXT: s_add_u32 s14, s5, s2
+; GFX6-NEXT: s_add_u32 s12, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s15, s4, s8
+; GFX6-NEXT: s_addc_u32 s13, s4, s8
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_addc_u32 s3, s11, s4
; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2
-; GFX6-NEXT: s_mul_i32 s2, s8, s15
+; GFX6-NEXT: s_mul_i32 s2, s8, s13
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2
; GFX6-NEXT: v_readfirstlane_b32 s11, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX6-NEXT: s_add_u32 s2, s11, s2
; GFX6-NEXT: s_addc_u32 s10, 0, s10
-; GFX6-NEXT: s_mul_i32 s11, s9, s14
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: s_mul_i32 s11, s9, s12
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_add_u32 s2, s2, s11
-; GFX6-NEXT: s_addc_u32 s2, s10, s14
+; GFX6-NEXT: s_addc_u32 s2, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s10, v0
; GFX6-NEXT: s_addc_u32 s10, s10, 0
-; GFX6-NEXT: s_mul_i32 s11, s9, s15
+; GFX6-NEXT: s_mul_i32 s11, s9, s13
; GFX6-NEXT: s_add_u32 s11, s2, s11
; GFX6-NEXT: v_mov_b32_e32 v0, s11
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: s_addc_u32 s10, 0, s10
; GFX6-NEXT: s_mul_i32 s10, s6, s10
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
-; GFX6-NEXT: s_add_i32 s10, s14, s10
-; GFX6-NEXT: s_mul_i32 s14, s7, s11
-; GFX6-NEXT: s_add_i32 s14, s10, s14
-; GFX6-NEXT: s_sub_i32 s15, s9, s14
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
+; GFX6-NEXT: s_add_i32 s10, s12, s10
+; GFX6-NEXT: s_mul_i32 s12, s7, s11
+; GFX6-NEXT: s_add_i32 s16, s10, s12
+; GFX6-NEXT: s_sub_i32 s12, s9, s16
; GFX6-NEXT: s_mul_i32 s10, s6, s11
; GFX6-NEXT: s_sub_u32 s8, s8, s10
; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s7
-; GFX6-NEXT: s_sub_u32 s17, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s18, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s7
-; GFX6-NEXT: s_cselect_b32 s11, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s6
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, s7
-; GFX6-NEXT: s_cselect_b32 s19, s19, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s7
-; GFX6-NEXT: s_sub_u32 s20, s17, s6
-; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX6-NEXT: s_or_b32 s13, s10, s11
+; GFX6-NEXT: s_subb_u32 s17, s12, s7
+; GFX6-NEXT: s_sub_u32 s18, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s19, s12, s13
+; GFX6-NEXT: s_subb_u32 s19, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s6
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s20, s21, s20
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s17, s7
+; GFX6-NEXT: s_sub_u32 s21, s18, s6
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s12, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_cselect_b32 s13, s21, s18
+; GFX6-NEXT: s_cselect_b32 s12, s12, s19
; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s10, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b32 s11, s20, s17
-; GFX6-NEXT: s_cselect_b32 s10, s10, s18
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s14
+; GFX6-NEXT: s_subb_u32 s9, s9, s16
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s10, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s14
+; GFX6-NEXT: s_cselect_b32 s6, s6, s10
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s10, s9
-; GFX6-NEXT: s_cselect_b32 s6, s11, s8
+; GFX6-NEXT: s_cselect_b32 s7, s12, s9
+; GFX6-NEXT: s_cselect_b32 s6, s13, s8
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_u32 s5, s6, s4
; GFX6-NEXT: s_subb_u32 s4, s7, s4
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NEXT: v_mov_b32_e32 v1, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NEXT: v_mov_b32_e32 v2, s5
; GFX6-NEXT: v_mov_b32_e32 v3, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9780,8 +9724,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT: s_sub_u32 s12, 0, s2
-; GFX9-NEXT: s_subb_u32 s13, 0, s3
+; GFX9-NEXT: s_sub_u32 s6, 0, s2
+; GFX9-NEXT: s_subb_u32 s7, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9790,56 +9734,52 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s14, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s7, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6
-; GFX9-NEXT: s_mul_i32 s15, s13, s6
-; GFX9-NEXT: s_add_i32 s7, s16, s7
-; GFX9-NEXT: s_mul_i32 s17, s12, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17
-; GFX9-NEXT: s_mul_i32 s18, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7
+; GFX9-NEXT: v_readfirstlane_b32 s12, v1
+; GFX9-NEXT: v_readfirstlane_b32 s13, v0
+; GFX9-NEXT: s_mul_i32 s14, s6, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13
+; GFX9-NEXT: s_mul_i32 s15, s7, s13
+; GFX9-NEXT: s_add_i32 s14, s16, s14
+; GFX9-NEXT: s_mul_i32 s17, s6, s13
+; GFX9-NEXT: s_add_i32 s14, s14, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17
+; GFX9-NEXT: s_mul_i32 s18, s13, s14
+; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14
; GFX9-NEXT: s_add_u32 s16, s16, s18
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17
-; GFX9-NEXT: s_mul_i32 s17, s14, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17
+; GFX9-NEXT: s_mul_i32 s17, s12, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7
+; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14
; GFX9-NEXT: s_addc_u32 s15, s15, s18
; GFX9-NEXT: s_addc_u32 s16, s19, 0
-; GFX9-NEXT: s_mul_i32 s7, s14, s7
-; GFX9-NEXT: s_add_u32 s7, s15, s7
+; GFX9-NEXT: s_mul_i32 s14, s12, s14
+; GFX9-NEXT: s_add_u32 s14, s15, s14
; GFX9-NEXT: s_addc_u32 s15, 0, s16
-; GFX9-NEXT: s_add_u32 s16, s6, s7
-; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT: s_addc_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_i32 s6, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16
-; GFX9-NEXT: s_add_i32 s6, s7, s6
-; GFX9-NEXT: s_mul_i32 s13, s13, s16
-; GFX9-NEXT: s_add_i32 s6, s6, s13
-; GFX9-NEXT: s_mul_i32 s12, s12, s16
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12
-; GFX9-NEXT: s_mul_i32 s15, s14, s12
-; GFX9-NEXT: s_mul_i32 s18, s16, s6
-; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12
-; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6
-; GFX9-NEXT: s_add_u32 s12, s12, s18
+; GFX9-NEXT: s_add_u32 s13, s13, s14
+; GFX9-NEXT: s_addc_u32 s12, s12, s15
+; GFX9-NEXT: s_mul_i32 s14, s6, s12
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13
+; GFX9-NEXT: s_add_i32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s7, s7, s13
+; GFX9-NEXT: s_add_i32 s14, s14, s7
+; GFX9-NEXT: s_mul_i32 s6, s6, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6
+; GFX9-NEXT: s_mul_i32 s16, s12, s6
+; GFX9-NEXT: s_mul_i32 s18, s13, s14
+; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6
+; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14
+; GFX9-NEXT: s_add_u32 s6, s6, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s12, s12, s15
-; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6
-; GFX9-NEXT: s_addc_u32 s12, s17, s13
+; GFX9-NEXT: s_add_u32 s6, s6, s16
+; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14
+; GFX9-NEXT: s_addc_u32 s6, s17, s15
; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s6, s14, s6
-; GFX9-NEXT: s_add_u32 s6, s12, s6
-; GFX9-NEXT: s_addc_u32 s12, 0, s7
-; GFX9-NEXT: s_add_u32 s13, s16, s6
-; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT: s_addc_u32 s12, s14, s12
+; GFX9-NEXT: s_mul_i32 s14, s12, s14
+; GFX9-NEXT: s_add_u32 s6, s6, s14
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
+; GFX9-NEXT: s_add_u32 s13, s13, s6
+; GFX9-NEXT: s_addc_u32 s12, s12, s7
; GFX9-NEXT: s_ashr_i32 s6, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s6
; GFX9-NEXT: s_mov_b32 s7, s6
@@ -9868,11 +9808,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s12, s2, s12
; GFX9-NEXT: s_sub_u32 s8, s8, s12
; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s17, s14, s3
; GFX9-NEXT: s_sub_u32 s18, s8, s2
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
; GFX9-NEXT: s_subb_u32 s19, s17, 0
; GFX9-NEXT: s_cmp_ge_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, -1, 0
@@ -9881,13 +9819,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, s21, s20
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s17, s17, s3
-; GFX9-NEXT: s_sub_u32 s21, s18, s2
-; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s14, s17, 0
+; GFX9-NEXT: s_subb_u32 s14, s17, s3
+; GFX9-NEXT: s_sub_u32 s15, s18, s2
+; GFX9-NEXT: s_subb_u32 s14, s14, 0
; GFX9-NEXT: s_cmp_lg_u32 s20, 0
-; GFX9-NEXT: s_cselect_b32 s15, s21, s18
+; GFX9-NEXT: s_cselect_b32 s15, s15, s18
; GFX9-NEXT: s_cselect_b32 s14, s14, s19
; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s9, s9, s16
@@ -9911,8 +9847,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s6, 0, s2
-; GFX9-NEXT: s_subb_u32 s7, 0, s3
+; GFX9-NEXT: s_sub_u32 s4, 0, s2
+; GFX9-NEXT: s_subb_u32 s5, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9922,74 +9858,70 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_readfirstlane_b32 s6, v1
; GFX9-NEXT: v_readfirstlane_b32 s9, v2
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4
-; GFX9-NEXT: s_mul_i32 s14, s6, s9
-; GFX9-NEXT: s_mul_i32 s5, s7, s4
+; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT: s_mul_i32 s14, s4, s9
+; GFX9-NEXT: s_mul_i32 s7, s5, s6
; GFX9-NEXT: s_add_i32 s8, s8, s14
-; GFX9-NEXT: s_add_i32 s8, s8, s5
-; GFX9-NEXT: s_mul_i32 s15, s6, s4
-; GFX9-NEXT: s_mul_i32 s14, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15
-; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8
+; GFX9-NEXT: s_add_i32 s8, s8, s7
+; GFX9-NEXT: s_mul_i32 s15, s4, s6
+; GFX9-NEXT: s_mul_i32 s14, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15
+; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8
; GFX9-NEXT: s_add_u32 s14, s16, s14
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15
; GFX9-NEXT: s_mul_i32 s15, s9, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8
-; GFX9-NEXT: s_addc_u32 s5, s5, s17
+; GFX9-NEXT: s_addc_u32 s7, s7, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
; GFX9-NEXT: s_mul_i32 s8, s9, s8
-; GFX9-NEXT: s_add_u32 s5, s5, s8
+; GFX9-NEXT: s_add_u32 s7, s7, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s4, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s7, s7, s14
-; GFX9-NEXT: s_add_i32 s4, s4, s7
-; GFX9-NEXT: s_mul_i32 s6, s6, s14
-; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6
-; GFX9-NEXT: s_mul_i32 s9, s8, s6
-; GFX9-NEXT: s_mul_i32 s16, s14, s4
-; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT: s_add_u32 s6, s6, s16
+; GFX9-NEXT: s_add_u32 s6, s6, s7
+; GFX9-NEXT: s_addc_u32 s7, s9, s8
+; GFX9-NEXT: s_mul_i32 s8, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6
+; GFX9-NEXT: s_add_i32 s8, s9, s8
+; GFX9-NEXT: s_mul_i32 s5, s5, s6
+; GFX9-NEXT: s_add_i32 s8, s8, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s6
+; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4
+; GFX9-NEXT: s_mul_i32 s14, s7, s4
+; GFX9-NEXT: s_mul_i32 s16, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8
+; GFX9-NEXT: s_add_u32 s4, s4, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s6, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4
-; GFX9-NEXT: s_addc_u32 s6, s15, s7
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8
+; GFX9-NEXT: s_addc_u32 s4, s15, s9
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s8, s4
-; GFX9-NEXT: s_add_u32 s4, s6, s4
-; GFX9-NEXT: s_addc_u32 s6, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s14, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s8, s6
+; GFX9-NEXT: s_mul_i32 s8, s7, s8
+; GFX9-NEXT: s_add_u32 s4, s4, s8
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s8, s6, s4
+; GFX9-NEXT: s_addc_u32 s9, s7, s5
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s6, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s7, s11, s4
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8
+; GFX9-NEXT: s_mul_i32 s11, s6, s9
+; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9
; GFX9-NEXT: s_add_u32 s11, s14, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9
-; GFX9-NEXT: s_mul_i32 s9, s7, s9
-; GFX9-NEXT: s_add_u32 s9, s11, s9
-; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8
-; GFX9-NEXT: s_addc_u32 s9, s10, s15
-; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8
; GFX9-NEXT: s_mul_i32 s8, s7, s8
-; GFX9-NEXT: s_add_u32 s8, s9, s8
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9
+; GFX9-NEXT: s_addc_u32 s8, s10, s15
+; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_i32 s9, s7, s9
+; GFX9-NEXT: s_add_u32 s8, s8, s9
; GFX9-NEXT: s_addc_u32 s9, 0, s10
; GFX9-NEXT: s_mul_i32 s9, s2, s9
; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8
@@ -10000,11 +9932,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s8, s2, s8
; GFX9-NEXT: s_sub_u32 s6, s6, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s15, s10, s3
; GFX9-NEXT: s_sub_u32 s16, s6, s2
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s17, s15, 0
; GFX9-NEXT: s_cmp_ge_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, -1, 0
@@ -10013,13 +9943,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, s19, s18
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s15, s15, s3
-; GFX9-NEXT: s_sub_u32 s19, s16, s2
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s15, 0
+; GFX9-NEXT: s_subb_u32 s10, s15, s3
+; GFX9-NEXT: s_sub_u32 s11, s16, s2
+; GFX9-NEXT: s_subb_u32 s10, s10, 0
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b32 s11, s19, s16
+; GFX9-NEXT: s_cselect_b32 s11, s11, s16
; GFX9-NEXT: s_cselect_b32 s10, s10, s17
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s7, s7, s14
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 394727c..01f4414 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -612,12 +612,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -653,12 +652,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -693,11 +691,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -733,11 +730,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -774,11 +770,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -818,11 +813,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -859,11 +853,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -901,15 +894,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -999,12 +992,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1042,12 +1034,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1084,11 +1075,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1127,11 +1117,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1171,11 +1160,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1218,11 +1206,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1261,11 +1248,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1306,15 +1292,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2073,12 +2059,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2114,12 +2099,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2154,11 +2138,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2194,11 +2177,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2235,11 +2217,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2279,11 +2260,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2321,11 +2301,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2363,15 +2342,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 258bc295..9db6d70 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -717,12 +717,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -762,12 +761,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -805,13 +803,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -853,11 +850,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -897,14 +893,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -949,11 +944,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -993,14 +987,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1028,6 +1022,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -1041,15 +1036,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2363,7 +2358,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2416,7 +2410,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2462,13 +2455,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2515,13 +2507,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2569,14 +2560,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2626,14 +2616,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2677,16 +2666,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2731,17 +2720,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4490,12 +4479,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4550,12 +4538,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4608,13 +4595,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4670,11 +4656,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4728,14 +4713,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4799,11 +4783,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4861,14 +4844,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4896,6 +4879,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4909,15 +4893,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -6673,7 +6657,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6746,7 +6729,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6812,13 +6794,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6883,13 +6864,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6955,14 +6935,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7036,14 +7015,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7109,16 +7087,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7163,17 +7141,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 23c5f4f..6167a84 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -499,12 +499,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -540,12 +539,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -580,11 +578,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -621,11 +618,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -663,11 +659,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -707,11 +702,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1088,11 +1082,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1117,11 +1110,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1147,9 +1139,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1176,9 +1167,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1206,10 +1196,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1239,10 +1227,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2022,7 +2008,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2071,7 +2056,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2112,13 +2096,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2160,13 +2143,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2209,14 +2191,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2261,14 +2242,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2881,7 +2861,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2914,7 +2893,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2946,7 +2924,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2979,7 +2956,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3013,8 +2989,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3048,9 +3022,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3906,12 +3879,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3947,12 +3919,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3987,11 +3958,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4028,11 +3998,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4070,11 +4039,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4114,11 +4082,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4495,11 +4462,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4524,11 +4490,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4554,9 +4519,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4583,9 +4547,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4613,10 +4576,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4646,10 +4607,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5452,7 +5411,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5501,7 +5459,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5542,13 +5499,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5590,13 +5546,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5639,14 +5594,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -5691,14 +5645,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -6313,12 +6266,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,12 +6306,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6394,11 +6345,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6435,11 +6385,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6477,11 +6426,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6521,11 +6469,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6926,12 +6873,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6973,12 +6919,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7015,15 +6960,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7065,12 +7009,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7109,16 +7052,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7163,12 +7105,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7672,12 +7613,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7713,12 +7653,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7753,11 +7692,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7794,11 +7732,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7836,11 +7773,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -7880,11 +7816,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -8284,12 +8219,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8331,12 +8265,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8373,15 +8306,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8423,12 +8355,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8467,16 +8398,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8521,12 +8451,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9030,12 +8959,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9071,12 +8999,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9111,11 +9038,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9152,11 +9078,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9194,11 +9119,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9238,11 +9162,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9642,12 +9565,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9689,12 +9611,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9731,15 +9652,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9781,12 +9701,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9825,16 +9744,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9879,12 +9797,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -10388,12 +10305,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10429,12 +10345,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10469,11 +10384,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10510,11 +10424,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10552,11 +10465,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -10596,11 +10508,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -11255,7 +11166,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11311,7 +11221,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11363,7 +11272,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11415,7 +11323,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11468,9 +11375,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -11525,9 +11431,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -12214,12 +12119,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12255,12 +12159,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12295,11 +12198,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12336,11 +12238,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12378,11 +12279,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -12422,11 +12322,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -13081,7 +12980,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13137,7 +13035,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13189,7 +13086,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13241,7 +13137,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13294,9 +13189,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -13351,9 +13245,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -14040,12 +13933,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14081,12 +13973,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14121,11 +14012,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14162,11 +14052,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14204,11 +14093,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14248,11 +14136,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14901,7 +14788,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14956,7 +14842,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15007,7 +14892,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15058,7 +14942,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15112,8 +14995,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15169,8 +15050,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15853,12 +15732,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15894,12 +15772,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15934,11 +15811,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15975,11 +15851,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16017,11 +15892,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16061,11 +15935,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16715,7 +16588,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16770,7 +16642,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16821,7 +16692,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16872,7 +16742,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16926,8 +16795,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -16983,8 +16850,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index e4def28..9afc0c6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -611,12 +611,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -652,12 +651,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -692,11 +690,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -732,11 +729,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -773,11 +769,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -817,11 +812,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -858,11 +852,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -900,15 +893,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1665,12 +1658,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1706,12 +1698,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1746,11 +1737,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1786,11 +1776,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1827,11 +1816,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1871,11 +1859,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1913,11 +1900,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1955,15 +1941,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 39a3c9a..10fd34f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -628,12 +628,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -670,12 +669,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -711,11 +709,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -752,11 +749,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -794,11 +790,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -839,11 +834,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -880,11 +874,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -923,15 +916,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1833,12 +1826,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1875,12 +1867,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1916,11 +1907,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1957,11 +1947,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1999,11 +1988,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2044,11 +2032,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2086,11 +2073,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2129,15 +2115,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 711d57b..30ad46d9 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -39131,21 +39131,21 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX1250-NEXT: v_cls_i32_e32 v3, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_add_nc_u32 v3, -1, v3 :: v_dual_bitop2_b32 v2, v0, v1 bitop3:0x14
; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_nc_u32_e32 v2, 32, v2
+; GFX1250-NEXT: v_min_u32_e32 v2, v3, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_min_u32 v2, v3, -1, v2
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = sitofp i64 %x to bfloat
@@ -39483,29 +39483,30 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX1250-NEXT: v_xor_b32_e32 v4, v2, v3
; GFX1250-NEXT: v_cls_i32_e32 v6, v3
; GFX1250-NEXT: v_cls_i32_e32 v7, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4
+; GFX1250-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_add_nc_u32 v7, -1, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_min_u32 v5, v7, -1, v5
-; GFX1250-NEXT: v_add_min_u32 v4, v6, -1, v4
+; GFX1250-NEXT: v_min_u32_e32 v5, v7, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_min_u32_e32 v4, v6, v4
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1]
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v4 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
; GFX1250-NEXT: v_sub_nc_u32_e32 v3, 32, v5
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_ldexp_f32 v1, v2, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = sitofp <2 x i64> %x to <2 x bfloat>
@@ -39968,41 +39969,42 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250TRUE16: ; %bb.0:
; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v2, v3
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v6, v4, v5
+; GFX1250TRUE16-NEXT: v_cls_i32_e32 v6, v5
+; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v4, v5
; GFX1250TRUE16-NEXT: v_cls_i32_e32 v10, v3
-; GFX1250TRUE16-NEXT: v_cls_i32_e32 v9, v5
; GFX1250TRUE16-NEXT: v_cls_i32_e32 v11, v1
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_ashrrev_i32 v6, 31, v6
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v8, v0, v1
-; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v6, 32, v6
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GFX1250TRUE16-NEXT: v_add_min_u32 v7, v10, -1, v7
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_add_min_u32 v6, v9, -1, v6
-; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
-; GFX1250TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14
+; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250TRUE16-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_bitop2_b32 v8, v2, v3 bitop3:0x14
+; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_add_nc_u32 v11, -1, v11
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v7, v10, v8
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v8, v11, v9
+; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1]
; GFX1250TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX1250TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT: v_add_min_u32 v8, v11, -1, v8
-; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1]
-; GFX1250TRUE16-NEXT: v_sub_nc_u32_e32 v5, 32, v8
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v5, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
+; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v4
; GFX1250TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX1250TRUE16-NEXT: v_ldexp_f32 v1, v1, v3
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -40017,44 +40019,47 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250FAKE16: ; %bb.0:
; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: v_xor_b32_e32 v8, v4, v5
-; GFX1250FAKE16-NEXT: v_xor_b32_e32 v6, v2, v3
+; GFX1250FAKE16-NEXT: v_cls_i32_e32 v6, v5
+; GFX1250FAKE16-NEXT: v_xor_b32_e32 v7, v2, v3
; GFX1250FAKE16-NEXT: v_cls_i32_e32 v10, v3
-; GFX1250FAKE16-NEXT: v_cls_i32_e32 v9, v5
; GFX1250FAKE16-NEXT: v_cls_i32_e32 v11, v1
-; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v8, 31, v8 :: v_dual_bitop2_b32 v7, v0, v1 bitop3:0x14
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v6, 31, v6 :: v_dual_ashrrev_i32 v7, 31, v7
-; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, 32, v6 :: v_dual_add_nc_u32 v7, 32, v7
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_add_min_u32 v6, v10, -1, v6
-; GFX1250FAKE16-NEXT: v_add_min_u32 v7, v11, -1, v7
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v8, v4, v5 bitop3:0x14
+; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3]
-; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v7, v[0:1]
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX1250FAKE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
-; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v11, -1, v11 :: v_dual_ashrrev_i32 v9, 31, v9
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v8, 32, v8
+; GFX1250FAKE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v7, v10, v7
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v6, v6, v8
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX1250FAKE16-NEXT: v_add_min_u32 v8, v9, -1, v8
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v9, v11, v9
+; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
-; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[4:5], v8, v[4:5]
-; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v8, 32, v8
-; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
+; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v9, v[0:1]
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
+; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v7 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v9 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
+; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3
; GFX1250FAKE16-NEXT: v_ldexp_f32 v0, v0, v4
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_ldexp_f32 v1, v1, v8
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250FAKE16-NEXT: v_ldexp_f32 v1, v1, v6
; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
@@ -40644,51 +40649,54 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_xor_b32_e32 v9, v4, v5
+; GFX1250-NEXT: v_cls_i32_e32 v9, v7
; GFX1250-NEXT: v_xor_b32_e32 v8, v6, v7
-; GFX1250-NEXT: v_cls_i32_e32 v12, v7
-; GFX1250-NEXT: v_cls_i32_e32 v13, v5
-; GFX1250-NEXT: v_cls_i32_e32 v14, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_ashrrev_i32 v8, 31, v8
-; GFX1250-NEXT: v_xor_b32_e32 v10, v2, v3
-; GFX1250-NEXT: v_cls_i32_e32 v15, v1
-; GFX1250-NEXT: v_dual_add_nc_u32 v9, 32, v9 :: v_dual_add_nc_u32 v8, 32, v8
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_dual_ashrrev_i32 v10, 31, v10 :: v_dual_bitop2_b32 v11, v0, v1 bitop3:0x14
-; GFX1250-NEXT: v_add_min_u32 v9, v13, -1, v9
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_add_min_u32 v8, v12, -1, v8
-; GFX1250-NEXT: v_dual_ashrrev_i32 v11, 31, v11 :: v_dual_add_nc_u32 v10, 32, v10
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v9, v[4:5]
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_add_nc_u32_e32 v11, 32, v11
-; GFX1250-NEXT: v_add_min_u32 v10, v14, -1, v10
+; GFX1250-NEXT: v_cls_i32_e32 v10, v5
+; GFX1250-NEXT: v_xor_b32_e32 v14, v0, v1
+; GFX1250-NEXT: v_cls_i32_e32 v12, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_dual_add_nc_u32 v9, -1, v9 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_bitop2_b32 v11, v4, v5 bitop3:0x14
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_min_u32 v11, v15, -1, v11
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3]
-; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6
-; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX1250-NEXT: v_dual_add_nc_u32 v8, 32, v8 :: v_dual_bitop2_b32 v13, v2, v3 bitop3:0x14
+; GFX1250-NEXT: v_ashrrev_i32_e32 v11, 31, v11
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_min_u32_e32 v8, v9, v8
+; GFX1250-NEXT: v_ashrrev_i32_e32 v9, 31, v13
+; GFX1250-NEXT: v_cls_i32_e32 v13, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_ashrrev_i32 v14, 31, v14 :: v_dual_add_nc_u32 v11, 32, v11
+; GFX1250-NEXT: v_dual_add_nc_u32 v12, -1, v12 :: v_dual_add_nc_u32 v9, 32, v9
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_add_nc_u32 v13, -1, v13 :: v_dual_add_nc_u32 v14, 32, v14
+; GFX1250-NEXT: v_min_u32_e32 v10, v10, v11
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_min_u32_e32 v9, v12, v9
+; GFX1250-NEXT: v_min_u32_e32 v11, v13, v14
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v10, v[4:5]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v9, v[2:3]
+; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v11, v[0:1]
+; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_or_b32_e32 v6, v7, v6
; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v10 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
-; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v9
+; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v10
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4
+; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v9 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
+; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v6
+; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v8
+; GFX1250-NEXT: v_cvt_f32_i32_e32 v3, v6
+; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX1250-NEXT: v_sub_nc_u32_e32 v6, 32, v11
; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX1250-NEXT: v_ldexp_f32 v1, v1, v3
+; GFX1250-NEXT: v_ldexp_f32 v1, v3, v1
; GFX1250-NEXT: v_ldexp_f32 v3, v4, v7
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_ldexp_f32 v2, v2, v5
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index afd0f01..6831380 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -415,28 +415,18 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-LABEL: memcpy_known:
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s13
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s13
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
-; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
-; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
-; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -491,25 +481,16 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12
-; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, 0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3
-; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11
-; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15]
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
+; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10
+; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11
; GISEL-GFX1100-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
@@ -960,28 +941,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-LABEL: memcpy_known_medium:
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s13
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s13
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
-; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
-; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
-; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1036,25 +1007,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12
-; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, 0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3
-; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11
-; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15]
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
+; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10
+; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11
; GISEL-GFX1100-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
@@ -1228,27 +1190,18 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s0
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s12, s7
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s1
; GISEL-GFX942-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13]
; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen
@@ -1261,35 +1214,24 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_clause 0x1
; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
-; GISEL-GFX1100-NEXT: s_mov_b32 s13, 0
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_mov_b32 s8, s13
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s13
+; GISEL-GFX1100-NEXT: s_load_b32 s11, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s1
-; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v4, s0
-; GISEL-GFX1100-NEXT: s_or_b64 s[0:1], s[12:13], s[8:9]
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s3
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[2:3], s[12:13], s[6:7]
-; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen
+; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
+; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX1100-NEXT: s_clause 0x1
-; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
+; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s13
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
-; GISEL-GFX1100-NEXT: s_mov_b32 s5, s10
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s8
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[12:13], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s11
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7]
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen
-; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen offset:16
+; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen offset:16
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen offset:16
; GISEL-GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 4a6fa4f..b96de17 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -704,7 +704,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_add_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
-; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_addc_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -725,16 +724,14 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_addc_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -746,12 +743,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s12, s14
+; GFX9-NEXT: s_addc_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -764,10 +759,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_add_u32 s0, s12, s14
-; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1010-NEXT: v_mov_b32_e32 v0, s0
-; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
; GFX1010-NEXT: s_addc_u32 s1, s13, s15
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -781,10 +774,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -798,10 +789,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -814,10 +803,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -831,10 +818,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_add_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1691,7 +1676,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_sub_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
-; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_subb_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -1712,16 +1696,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_subb_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -1733,12 +1715,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_subb_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -1751,10 +1731,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_sub_u32 s0, s12, s14
-; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1010-NEXT: v_mov_b32_e32 v0, s0
-; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
; GFX1010-NEXT: s_subb_u32 s1, s13, s15
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1768,10 +1746,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1785,10 +1761,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -1801,10 +1775,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1818,10 +1790,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -2218,49 +2188,46 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: s_addc_u32 s6, s7, s9
; VI-NEXT: s_addc_u32 s8, s8, 0
; VI-NEXT: v_readfirstlane_b32 s7, v0
-; VI-NEXT: s_add_u32 s12, s6, s7
-; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: s_add_u32 s10, s6, s7
+; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0
-; VI-NEXT: s_addc_u32 s13, 0, s8
-; VI-NEXT: s_mul_i32 s8, s4, s13
+; VI-NEXT: s_addc_u32 s11, 0, s8
+; VI-NEXT: s_mul_i32 s8, s4, s11
; VI-NEXT: v_readfirstlane_b32 s9, v1
; VI-NEXT: s_add_i32 s8, s9, s8
-; VI-NEXT: s_mul_i32 s9, s5, s12
-; VI-NEXT: s_add_i32 s14, s8, s9
-; VI-NEXT: s_sub_i32 s10, s3, s14
+; VI-NEXT: s_mul_i32 s9, s5, s10
+; VI-NEXT: s_add_i32 s12, s8, s9
+; VI-NEXT: s_sub_i32 s13, s3, s12
; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: s_sub_u32 s15, s2, s8
+; VI-NEXT: s_sub_u32 s14, s2, s8
; VI-NEXT: s_cselect_b64 s[8:9], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT: s_subb_u32 s16, s10, s5
-; VI-NEXT: s_sub_u32 s17, s15, s4
-; VI-NEXT: s_cselect_b64 s[10:11], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[10:11], 0
-; VI-NEXT: s_subb_u32 s10, s16, 0
-; VI-NEXT: s_cmp_ge_u32 s10, s5
-; VI-NEXT: s_cselect_b32 s11, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s17, s4
+; VI-NEXT: s_subb_u32 s13, s13, s5
+; VI-NEXT: s_sub_u32 s15, s14, s4
+; VI-NEXT: s_subb_u32 s13, s13, 0
+; VI-NEXT: s_cmp_ge_u32 s13, s5
; VI-NEXT: s_cselect_b32 s16, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s10, s5
-; VI-NEXT: s_cselect_b32 s10, s16, s11
-; VI-NEXT: s_add_u32 s11, s12, 1
-; VI-NEXT: s_addc_u32 s16, s13, 0
-; VI-NEXT: s_add_u32 s17, s12, 2
-; VI-NEXT: s_addc_u32 s18, s13, 0
-; VI-NEXT: s_cmp_lg_u32 s10, 0
-; VI-NEXT: s_cselect_b32 s10, s17, s11
-; VI-NEXT: s_cselect_b32 s11, s18, s16
+; VI-NEXT: s_cmp_ge_u32 s15, s4
+; VI-NEXT: s_cselect_b32 s15, -1, 0
+; VI-NEXT: s_cmp_eq_u32 s13, s5
+; VI-NEXT: s_cselect_b32 s13, s15, s16
+; VI-NEXT: s_add_u32 s15, s10, 1
+; VI-NEXT: s_addc_u32 s16, s11, 0
+; VI-NEXT: s_add_u32 s17, s10, 2
+; VI-NEXT: s_addc_u32 s18, s11, 0
+; VI-NEXT: s_cmp_lg_u32 s13, 0
+; VI-NEXT: s_cselect_b32 s13, s17, s15
+; VI-NEXT: s_cselect_b32 s15, s18, s16
; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT: s_subb_u32 s3, s3, s14
+; VI-NEXT: s_subb_u32 s3, s3, s12
; VI-NEXT: s_cmp_ge_u32 s3, s5
; VI-NEXT: s_cselect_b32 s8, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s15, s4
+; VI-NEXT: s_cmp_ge_u32 s14, s4
; VI-NEXT: s_cselect_b32 s9, -1, 0
; VI-NEXT: s_cmp_eq_u32 s3, s5
; VI-NEXT: s_cselect_b32 s3, s9, s8
; VI-NEXT: s_cmp_lg_u32 s3, 0
-; VI-NEXT: s_cselect_b32 s9, s11, s13
-; VI-NEXT: s_cselect_b32 s8, s10, s12
+; VI-NEXT: s_cselect_b32 s9, s15, s11
+; VI-NEXT: s_cselect_b32 s8, s13, s10
; VI-NEXT: s_cbranch_execnz .LBB16_4
; VI-NEXT: .LBB16_2:
; VI-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2311,8 +2278,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s10, 0, s6
-; GFX9-NEXT: s_subb_u32 s11, 0, s7
+; GFX9-NEXT: s_sub_u32 s8, 0, s6
+; GFX9-NEXT: s_subb_u32 s9, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2321,109 +2288,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s9, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8
-; GFX9-NEXT: s_mul_i32 s13, s11, s8
-; GFX9-NEXT: s_add_i32 s9, s14, s9
-; GFX9-NEXT: s_add_i32 s9, s9, s13
-; GFX9-NEXT: s_mul_i32 s15, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9
+; GFX9-NEXT: v_readfirstlane_b32 s10, v1
+; GFX9-NEXT: v_readfirstlane_b32 s11, v0
+; GFX9-NEXT: s_mul_i32 s12, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11
+; GFX9-NEXT: s_mul_i32 s13, s9, s11
+; GFX9-NEXT: s_add_i32 s12, s14, s12
+; GFX9-NEXT: s_add_i32 s12, s12, s13
+; GFX9-NEXT: s_mul_i32 s15, s8, s11
+; GFX9-NEXT: s_mul_i32 s14, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15
+; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
; GFX9-NEXT: s_add_u32 s14, s16, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT: s_mul_i32 s15, s10, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9
+; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s9, s12, s9
-; GFX9-NEXT: s_add_u32 s9, s13, s9
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s12, s13, s12
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s8, s9
-; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s8, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14
-; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s11, s11, s14
-; GFX9-NEXT: s_add_i32 s8, s8, s11
-; GFX9-NEXT: s_mul_i32 s10, s10, s14
-; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s12, s10
-; GFX9-NEXT: s_mul_i32 s16, s14, s8
-; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8
-; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_add_u32 s11, s11, s12
+; GFX9-NEXT: s_addc_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_i32 s12, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s9, s9, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s9
+; GFX9-NEXT: s_mul_i32 s8, s8, s11
+; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8
+; GFX9-NEXT: s_mul_i32 s14, s10, s8
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT: s_add_u32 s8, s8, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_add_u32 s8, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12
+; GFX9-NEXT: s_addc_u32 s8, s15, s13
; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_mul_i32 s8, s12, s8
-; GFX9-NEXT: s_add_u32 s8, s10, s8
-; GFX9-NEXT: s_addc_u32 s10, 0, s9
-; GFX9-NEXT: s_add_u32 s11, s14, s8
-; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_addc_u32 s8, s12, s10
-; GFX9-NEXT: s_mul_i32 s10, s2, s8
-; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11
-; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX9-NEXT: s_add_u32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_addc_u32 s9, 0, s9
-; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11
-; GFX9-NEXT: s_mul_i32 s11, s3, s11
-; GFX9-NEXT: s_add_u32 s10, s10, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8
-; GFX9-NEXT: s_addc_u32 s9, s9, s13
-; GFX9-NEXT: s_addc_u32 s10, s12, 0
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_addc_u32 s9, s10, s9
+; GFX9-NEXT: s_mul_i32 s11, s2, s9
+; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9
+; GFX9-NEXT: s_add_u32 s11, s12, s11
+; GFX9-NEXT: s_addc_u32 s10, 0, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8
; GFX9-NEXT: s_mul_i32 s8, s3, s8
-; GFX9-NEXT: s_add_u32 s12, s9, s8
-; GFX9-NEXT: s_addc_u32 s13, 0, s10
-; GFX9-NEXT: s_mul_i32 s8, s6, s13
-; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9
+; GFX9-NEXT: s_addc_u32 s8, s10, s13
+; GFX9-NEXT: s_addc_u32 s10, s12, 0
+; GFX9-NEXT: s_mul_i32 s9, s3, s9
+; GFX9-NEXT: s_add_u32 s11, s8, s9
+; GFX9-NEXT: s_addc_u32 s10, 0, s10
+; GFX9-NEXT: s_mul_i32 s8, s6, s10
+; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11
; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s9, s7, s12
-; GFX9-NEXT: s_add_i32 s14, s8, s9
-; GFX9-NEXT: s_sub_i32 s10, s3, s14
-; GFX9-NEXT: s_mul_i32 s8, s6, s12
-; GFX9-NEXT: s_sub_u32 s15, s2, s8
+; GFX9-NEXT: s_mul_i32 s9, s7, s11
+; GFX9-NEXT: s_add_i32 s12, s8, s9
+; GFX9-NEXT: s_sub_i32 s13, s3, s12
+; GFX9-NEXT: s_mul_i32 s8, s6, s11
+; GFX9-NEXT: s_sub_u32 s14, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_subb_u32 s16, s10, s7
-; GFX9-NEXT: s_sub_u32 s17, s15, s6
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s16, 0
-; GFX9-NEXT: s_cmp_ge_u32 s10, s7
-; GFX9-NEXT: s_cselect_b32 s11, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s17, s6
+; GFX9-NEXT: s_subb_u32 s13, s13, s7
+; GFX9-NEXT: s_sub_u32 s15, s14, s6
+; GFX9-NEXT: s_subb_u32 s13, s13, 0
+; GFX9-NEXT: s_cmp_ge_u32 s13, s7
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s10, s7
-; GFX9-NEXT: s_cselect_b32 s10, s16, s11
-; GFX9-NEXT: s_add_u32 s11, s12, 1
-; GFX9-NEXT: s_addc_u32 s16, s13, 0
-; GFX9-NEXT: s_add_u32 s17, s12, 2
-; GFX9-NEXT: s_addc_u32 s18, s13, 0
-; GFX9-NEXT: s_cmp_lg_u32 s10, 0
-; GFX9-NEXT: s_cselect_b32 s10, s17, s11
-; GFX9-NEXT: s_cselect_b32 s11, s18, s16
+; GFX9-NEXT: s_cmp_ge_u32 s15, s6
+; GFX9-NEXT: s_cselect_b32 s15, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s13, s7
+; GFX9-NEXT: s_cselect_b32 s13, s15, s16
+; GFX9-NEXT: s_add_u32 s15, s11, 1
+; GFX9-NEXT: s_addc_u32 s16, s10, 0
+; GFX9-NEXT: s_add_u32 s17, s11, 2
+; GFX9-NEXT: s_addc_u32 s18, s10, 0
+; GFX9-NEXT: s_cmp_lg_u32 s13, 0
+; GFX9-NEXT: s_cselect_b32 s13, s17, s15
+; GFX9-NEXT: s_cselect_b32 s15, s18, s16
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s14
+; GFX9-NEXT: s_subb_u32 s3, s3, s12
; GFX9-NEXT: s_cmp_ge_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s8, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s15, s6
+; GFX9-NEXT: s_cmp_ge_u32 s14, s6
; GFX9-NEXT: s_cselect_b32 s9, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s3, s9, s8
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s9, s11, s13
-; GFX9-NEXT: s_cselect_b32 s8, s10, s12
+; GFX9-NEXT: s_cselect_b32 s9, s15, s10
+; GFX9-NEXT: s_cselect_b32 s8, s13, s11
; GFX9-NEXT: s_cbranch_execnz .LBB16_3
; GFX9-NEXT: .LBB16_2:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
@@ -2503,44 +2463,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_add_u32 s11, s12, s11
; GFX1010-NEXT: s_addc_u32 s12, 0, s13
; GFX1010-NEXT: s_add_u32 s8, s8, s11
-; GFX1010-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1010-NEXT: s_mul_i32 s11, s9, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s12
-; GFX1010-NEXT: s_mul_i32 s10, s10, s8
+; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1010-NEXT: s_mul_i32 s12, s9, s8
; GFX1010-NEXT: s_mul_i32 s9, s9, s5
-; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1010-NEXT: s_add_i32 s9, s13, s9
-; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11
+; GFX1010-NEXT: s_mul_i32 s10, s10, s8
+; GFX1010-NEXT: s_add_i32 s9, s11, s9
+; GFX1010-NEXT: s_mul_i32 s11, s5, s12
; GFX1010-NEXT: s_add_i32 s9, s9, s10
-; GFX1010-NEXT: s_mul_i32 s10, s5, s11
+; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1010-NEXT: s_mul_i32 s15, s8, s9
; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1010-NEXT: s_add_u32 s12, s12, s15
+; GFX1010-NEXT: s_add_u32 s10, s10, s15
+; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12
; GFX1010-NEXT: s_addc_u32 s14, 0, s14
-; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9
-; GFX1010-NEXT: s_add_u32 s10, s12, s10
+; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9
+; GFX1010-NEXT: s_add_u32 s10, s10, s11
; GFX1010-NEXT: s_mul_i32 s9, s5, s9
; GFX1010-NEXT: s_addc_u32 s10, s14, s13
-; GFX1010-NEXT: s_addc_u32 s11, s11, 0
+; GFX1010-NEXT: s_addc_u32 s11, s12, 0
; GFX1010-NEXT: s_add_u32 s9, s10, s9
; GFX1010-NEXT: s_addc_u32 s10, 0, s11
; GFX1010-NEXT: s_add_u32 s8, s8, s9
-; GFX1010-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1010-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s10
-; GFX1010-NEXT: s_mul_i32 s8, s3, s8
+; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1010-NEXT: s_mul_i32 s12, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5
-; GFX1010-NEXT: s_add_u32 s11, s11, s12
-; GFX1010-NEXT: s_addc_u32 s10, 0, s10
+; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5
+; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1010-NEXT: s_mul_i32 s8, s3, s8
+; GFX1010-NEXT: s_add_u32 s9, s9, s12
+; GFX1010-NEXT: s_addc_u32 s11, 0, s11
; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1010-NEXT: s_add_u32 s8, s11, s8
+; GFX1010-NEXT: s_add_u32 s8, s9, s8
; GFX1010-NEXT: s_mul_i32 s5, s3, s5
-; GFX1010-NEXT: s_addc_u32 s8, s10, s9
+; GFX1010-NEXT: s_addc_u32 s8, s11, s10
; GFX1010-NEXT: s_addc_u32 s9, s13, 0
; GFX1010-NEXT: s_add_u32 s5, s8, s5
; GFX1010-NEXT: s_addc_u32 s8, 0, s9
@@ -2553,11 +2509,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_sub_i32 s11, s3, s9
; GFX1010-NEXT: s_sub_u32 s10, s2, s10
; GFX1010-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1010-NEXT: s_cmp_lg_u32 s12, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, s7
; GFX1010-NEXT: s_sub_u32 s13, s10, s6
-; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1010-NEXT: s_cmp_lg_u32 s14, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, 0
; GFX1010-NEXT: s_cmp_ge_u32 s11, s7
; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
@@ -2663,44 +2616,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_add_u32 s11, s12, s11
; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13
; GFX1030W32-NEXT: s_add_u32 s8, s8, s11
-; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12
-; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8
; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1030W32-NEXT: s_add_i32 s9, s13, s9
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11
+; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT: s_add_i32 s9, s11, s9
+; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12
; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9
; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1030W32-NEXT: s_add_u32 s12, s12, s15
+; GFX1030W32-NEXT: s_add_u32 s10, s10, s15
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12
; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9
-; GFX1030W32-NEXT: s_add_u32 s10, s12, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9
+; GFX1030W32-NEXT: s_add_u32 s10, s10, s11
; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9
; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13
-; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0
+; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0
; GFX1030W32-NEXT: s_add_u32 s9, s10, s9
; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11
; GFX1030W32-NEXT: s_add_u32 s8, s8, s9
-; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10
-; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7
-; GFX1030W32-NEXT: s_add_u32 s11, s11, s12
-; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT: s_add_u32 s9, s9, s12
+; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11
; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX1030W32-NEXT: s_add_u32 s8, s11, s8
+; GFX1030W32-NEXT: s_add_u32 s8, s9, s8
; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9
+; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10
; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0
; GFX1030W32-NEXT: s_add_u32 s7, s8, s7
; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9
@@ -2713,11 +2662,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9
; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10
; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5
; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4
-; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0
; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5
; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
@@ -2790,8 +2736,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: ; %bb.1:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5
-; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4
-; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5
+; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0
; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2800,109 +2746,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1
-; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6
-; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6
-; GFX1030W64-NEXT: s_add_i32 s7, s12, s7
-; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6
-; GFX1030W64-NEXT: s_add_i32 s7, s7, s11
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13
-; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7
-; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13
-; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7
+; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1
+; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0
+; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7
+; GFX1030W64-NEXT: s_add_i32 s10, s12, s10
+; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7
+; GFX1030W64-NEXT: s_add_i32 s10, s10, s11
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13
+; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13
+; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10
; GFX1030W64-NEXT: s_add_u32 s12, s12, s15
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7
+; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10
; GFX1030W64-NEXT: s_add_u32 s11, s12, s11
-; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10
; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14
; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0
-; GFX1030W64-NEXT: s_add_u32 s7, s11, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12
-; GFX1030W64-NEXT: s_add_u32 s12, s6, s7
-; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12
-; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11
-; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12
-; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6
-; GFX1030W64-NEXT: s_add_i32 s9, s13, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6
-; GFX1030W64-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6
-; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9
-; GFX1030W64-NEXT: s_add_u32 s7, s7, s14
+; GFX1030W64-NEXT: s_add_u32 s7, s7, s10
+; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11
+; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6
+; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7
+; GFX1030W64-NEXT: s_add_i32 s8, s10, s8
+; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11
+; GFX1030W64-NEXT: s_add_i32 s8, s8, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11
+; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8
+; GFX1030W64-NEXT: s_add_u32 s9, s9, s14
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9
-; GFX1030W64-NEXT: s_add_u32 s6, s7, s6
-; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9
-; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11
-; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0
-; GFX1030W64-NEXT: s_add_u32 s6, s6, s9
-; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7
-; GFX1030W64-NEXT: s_add_u32 s10, s12, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10
-; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9
-; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10
-; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7
-; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7
-; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
-; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7
-; GFX1030W64-NEXT: s_add_u32 s8, s10, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8
+; GFX1030W64-NEXT: s_add_u32 s9, s9, s10
+; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8
+; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12
+; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0
+; GFX1030W64-NEXT: s_add_u32 s8, s9, s8
+; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10
+; GFX1030W64-NEXT: s_add_u32 s7, s7, s8
+; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7
; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6
+; GFX1030W64-NEXT: s_add_u32 s8, s8, s11
+; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6
+; GFX1030W64-NEXT: s_add_u32 s7, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6
+; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9
; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0
-; GFX1030W64-NEXT: s_add_u32 s10, s6, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s7, s6
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8
; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10
; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11
; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10
; GFX1030W64-NEXT: s_add_i32 s6, s6, s7
-; GFX1030W64-NEXT: s_add_i32 s12, s6, s8
+; GFX1030W64-NEXT: s_add_i32 s8, s6, s8
; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10
-; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12
-; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6
+; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8
+; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6
; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5
-; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5
-; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5
+; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0
+; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5
; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5
-; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9
-; GFX1030W64-NEXT: s_add_u32 s9, s10, 1
+; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
+; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0
+; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5
+; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14
+; GFX1030W64-NEXT: s_add_u32 s13, s10, 1
; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0
; GFX1030W64-NEXT: s_add_u32 s15, s10, 2
; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0
-; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0
-; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9
+; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13
; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14
; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12
+; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8
; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
+; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4
; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0
; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6
; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0
; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11
-; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10
+; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10
; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3
; GFX1030W64-NEXT: .LBB16_2:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2988,44 +2927,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_u32 s11, s12, s11
; GFX11-NEXT: s_addc_u32 s12, 0, s13
; GFX11-NEXT: s_add_u32 s8, s8, s11
-; GFX11-NEXT: s_cselect_b32 s11, -1, 0
-; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX11-NEXT: s_cmp_lg_u32 s11, 0
-; GFX11-NEXT: s_mul_i32 s11, s9, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s12
-; GFX11-NEXT: s_mul_i32 s10, s10, s8
+; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX11-NEXT: s_mul_i32 s12, s9, s8
; GFX11-NEXT: s_mul_i32 s9, s9, s7
-; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX11-NEXT: s_add_i32 s9, s13, s9
-; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11
+; GFX11-NEXT: s_mul_i32 s10, s10, s8
+; GFX11-NEXT: s_add_i32 s9, s11, s9
+; GFX11-NEXT: s_mul_i32 s11, s7, s12
; GFX11-NEXT: s_add_i32 s9, s9, s10
-; GFX11-NEXT: s_mul_i32 s10, s7, s11
+; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX11-NEXT: s_mul_i32 s15, s8, s9
; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX11-NEXT: s_add_u32 s12, s12, s15
+; GFX11-NEXT: s_add_u32 s10, s10, s15
+; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12
; GFX11-NEXT: s_addc_u32 s14, 0, s14
-; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9
-; GFX11-NEXT: s_add_u32 s10, s12, s10
+; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9
+; GFX11-NEXT: s_add_u32 s10, s10, s11
; GFX11-NEXT: s_mul_i32 s9, s7, s9
; GFX11-NEXT: s_addc_u32 s10, s14, s13
-; GFX11-NEXT: s_addc_u32 s11, s11, 0
+; GFX11-NEXT: s_addc_u32 s11, s12, 0
; GFX11-NEXT: s_add_u32 s9, s10, s9
; GFX11-NEXT: s_addc_u32 s10, 0, s11
; GFX11-NEXT: s_add_u32 s8, s8, s9
-; GFX11-NEXT: s_cselect_b32 s9, -1, 0
-; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX11-NEXT: s_cmp_lg_u32 s9, 0
-; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s10
-; GFX11-NEXT: s_mul_i32 s8, s3, s8
+; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX11-NEXT: s_mul_i32 s12, s2, s7
-; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7
-; GFX11-NEXT: s_add_u32 s11, s11, s12
-; GFX11-NEXT: s_addc_u32 s10, 0, s10
+; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7
+; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX11-NEXT: s_mul_i32 s8, s3, s8
+; GFX11-NEXT: s_add_u32 s9, s9, s12
+; GFX11-NEXT: s_addc_u32 s11, 0, s11
; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX11-NEXT: s_add_u32 s8, s11, s8
+; GFX11-NEXT: s_add_u32 s8, s9, s8
; GFX11-NEXT: s_mul_i32 s7, s3, s7
-; GFX11-NEXT: s_addc_u32 s8, s10, s9
+; GFX11-NEXT: s_addc_u32 s8, s11, s10
; GFX11-NEXT: s_addc_u32 s9, s13, 0
; GFX11-NEXT: s_add_u32 s7, s8, s7
; GFX11-NEXT: s_addc_u32 s8, 0, s9
@@ -3035,17 +2970,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_i32 s9, s9, s10
; GFX11-NEXT: s_mul_i32 s10, s4, s7
; GFX11-NEXT: s_add_i32 s9, s9, s11
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s11, s3, s9
; GFX11-NEXT: s_sub_u32 s10, s2, s10
; GFX11-NEXT: s_cselect_b32 s12, -1, 0
-; GFX11-NEXT: s_cmp_lg_u32 s12, 0
; GFX11-NEXT: s_subb_u32 s11, s11, s5
; GFX11-NEXT: s_sub_u32 s13, s10, s4
-; GFX11-NEXT: s_cselect_b32 s14, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s14, 0
; GFX11-NEXT: s_subb_u32 s11, s11, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_ge_u32 s11, s5
; GFX11-NEXT: s_cselect_b32 s14, -1, 0
; GFX11-NEXT: s_cmp_ge_u32 s13, s4
@@ -3118,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000
-; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1250-NEXT: ; %bb.1:
; GFX1250-NEXT: s_cvt_f32_u32 s4, s6
@@ -3155,12 +3086,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s12
-; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_cmp_lg_u32 s4, 0
; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11
; GFX1250-NEXT: s_mul_i32 s12, s8, s11
; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10
@@ -3175,19 +3103,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s10
-; GFX1250-NEXT: s_cselect_b32 s10, -1, 0
-; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
-; GFX1250-NEXT: s_cmp_lg_u32 s10, 0
-; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8
; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11
-; GFX1250-NEXT: s_mul_i32 s11, s3, s8
+; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
+; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8
+; GFX1250-NEXT: s_mul_i32 s12, s3, s8
; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10
; GFX1250-NEXT: s_mul_i32 s8, s2, s10
; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9]
; GFX1250-NEXT: s_mul_i32 s10, s3, s10
-; GFX1250-NEXT: s_add_co_u32 s4, s8, s11
-; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12
+; GFX1250-NEXT: s_add_co_u32 s4, s8, s12
+; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11
; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11]
@@ -3202,10 +3128,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_cmp_lg_u32 s8, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7
; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6
-; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_cmp_lg_u32 s14, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_cmp_ge_u32 s12, s7
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
; GFX1250-NEXT: s_cmp_ge_u32 s13, s6
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 57a1e4c..ec92edb 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -3385,7 +3385,7 @@ declare half @llvm.canonicalize.f16(half)
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>)
attributes #0 = { nounwind "amdgpu-ieee"="false" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" "no-nans-fp-math"="true" }
+attributes #1 = { nounwind "no-nans-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11NONANS-FAKE16: {{.*}}
; GFX11NONANS-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 4b151b9..07e6a76 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -714,9 +714,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: s_lshl_b32 s2, s2, 8
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_lshl_b32 s3, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_flbit_i32_b32 s3, s3
-; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_cselect_b32 s2, s3, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index cefcbdd..fca57be 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1491,7 +1491,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s6, 16
-; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: s_cbranch_scc0 .LBB14_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mov_b32 s11, 0xf000
@@ -1521,7 +1520,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s6, 16
-; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cbranch_scc0 .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s11, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 6c8207a..df7f8c6 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -4344,7 +4344,7 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-G-NEXT: v_lshrrev_b32_e32 v3, 1, v4
-; GFX9-G-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX9-G-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-G-NEXT: s_setpc_b64 s[30:31]
@@ -4375,14 +4375,12 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-G-O0-NEXT: s_mov_b32 s5, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v3
-; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v0, v[5:6]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v3, v2, v4
@@ -4437,7 +4435,7 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-NEXT: v_mov_b32_e32 v4, v1
; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4
-; GFX9-G-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX9-G-NEXT: v_mov_b32_e32 v3, 0
; GFX9-G-NEXT: s_setpc_b64 s[30:31]
@@ -4450,15 +4448,13 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v3
; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1
; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v4
-; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v0, v[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index d8a5e7fa..dbdea8e 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -14,7 +14,6 @@ define i32 @s_add_co_select_user() {
; GFX7-NEXT: s_add_u32 s7, s6, s6
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_or_b32 s4, s4, s5
-; GFX7-NEXT: s_cmp_lg_u32 s4, 0
; GFX7-NEXT: s_addc_u32 s8, s6, 0
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -31,8 +30,6 @@ define i32 @s_add_co_select_user() {
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s7, s6, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_addc_u32 s8, s6, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -49,8 +46,6 @@ define i32 @s_add_co_select_user() {
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s5, s4, s4
-; GFX10-NEXT: s_cselect_b32 s6, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_addc_u32 s6, s4, 0
; GFX10-NEXT: s_cselect_b32 s7, -1, 0
; GFX10-NEXT: s_and_b32 s7, s7, exec_lo
@@ -67,16 +62,13 @@ define i32 @s_add_co_select_user() {
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_addc_u32 s2, s0, 0
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, 0
; GFX11-NEXT: s_cmp_gt_u32 s0, 31
; GFX11-NEXT: s_cselect_b32 s0, s1, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -104,7 +96,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-NEXT: s_add_u32 s0, s2, s2
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_or_b32 s0, s0, s1
-; GFX7-NEXT: s_cmp_lg_u32 s0, 0
; GFX7-NEXT: s_addc_u32 s0, s2, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1]
@@ -125,12 +116,10 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
;
; GFX9-LABEL: s_add_co_br_user:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s2, s2
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s2, 0
+; GFX9-NEXT: s_add_u32 s1, s0, s0
+; GFX9-NEXT: s_addc_u32 s0, s0, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX9-NEXT: s_cbranch_vccnz .LBB1_2
@@ -153,8 +142,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s1, s0, s0
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0
@@ -178,11 +165,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
-; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 62847b1..9a17538 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1117,7 +1117,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s3, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s3, s0
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1169,7 +1168,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s3, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s3, s0
-; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1217,7 +1215,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s3, s0
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
@@ -1264,11 +1261,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0
-; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -1320,11 +1315,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0
-; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -4023,7 +4016,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s6, s4, 0xffe
; SI-NEXT: s_and_b32 s4, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s4, s0
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: v_cvt_f16_f32_e32 v0, s5
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
@@ -4066,7 +4058,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s5, s0, 0xffe
; SI-NEXT: s_and_b32 s0, s3, 0x1ff
; SI-NEXT: s_or_b32 s0, s0, s2
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT: v_readfirstlane_b32 s0, v2
@@ -4120,10 +4111,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_lshr_b32 s5, s3, 8
-; VI-NEXT: s_and_b32 s6, s3, 0x1ff
; VI-NEXT: s_and_b32 s5, s5, 0xffe
+; VI-NEXT: s_and_b32 s6, s3, 0x1ff
; VI-NEXT: s_or_b32 s2, s6, s2
-; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b64 s[6:7], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014
@@ -4163,7 +4153,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-NEXT: s_and_b32 s7, s2, 0xffe
; VI-NEXT: s_and_b32 s2, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s2, s0
-; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014
@@ -4209,10 +4198,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s5, s3, 8
-; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX9-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX9-NEXT: s_or_b32 s2, s6, s2
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014
@@ -4254,7 +4242,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-NEXT: s_and_b32 s6, s2, 0xffe
; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s2, s0
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
@@ -4301,11 +4288,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
;
; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff
-; GFX11-NEXT: s_lshr_b32 s6, s3, 8
-; GFX11-NEXT: s_or_b32 s2, s5, s2
-; GFX11-NEXT: s_and_b32 s5, s6, 0xffe
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-NEXT: s_or_b32 s2, s6, s2
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -4348,13 +4334,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-NEXT: s_cselect_b32 s2, s5, s6
; GFX11-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
; GFX11-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-NEXT: s_or_b32 s0, s6, s0
+; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_or_b32 s0, s6, s0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
index acb32d4..11476a6 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -127,7 +127,7 @@ define amdgpu_kernel void @s_fdiv_v4f64(ptr addrspace(1) %out, <4 x double> %num
; GCN-LABEL: {{^}}div_fast_2_x_pat_f64:
; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0.5
; GCN: buffer_store_dwordx2 [[MUL]]
-define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #0 {
%x = load double, ptr addrspace(1) poison
%rcp = fdiv fast double %x, 2.0
store double %rcp, ptr addrspace(1) %out, align 4
@@ -139,7 +139,7 @@ define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 {
; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0x3fb99999
; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]]
; GCN: buffer_store_dwordx2 [[MUL]]
-define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #0 {
%x = load double, ptr addrspace(1) poison
%rcp = fdiv fast double %x, 10.0
store double %rcp, ptr addrspace(1) %out, align 4
@@ -151,7 +151,7 @@ define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #1 {
; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0xbfb99999
; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]]
; GCN: buffer_store_dwordx2 [[MUL]]
-define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #0 {
%x = load double, ptr addrspace(1) poison
%rcp = fdiv fast double %x, -10.0
store double %rcp, ptr addrspace(1) %out, align 4
@@ -159,4 +159,3 @@ define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 {
}
attributes #0 = { nounwind }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index b5b2655..31344c7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -2080,21 +2080,13 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrs
}
define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) {
-; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_or_b32_e32 v0, 0x1040, v0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 4160
%addr = inttoptr i64 %or to ptr
diff --git a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
index 92eb4a6..0a266bc 100644
--- a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
@@ -284,4 +284,4 @@ define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, <
ret <2 x float> %tmp1
}
-attributes #0 = { "no-infs-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { "no-infs-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index e59fbad..62ec010 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -1,117 +1,296 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define amdgpu_ps float @test_fmaximum_f32_vv(float %a, float %b) {
-; GCN-LABEL: test_fmaximum_f32_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fmaximum_f32_ss(float inreg %a, float inreg %b) {
-; GCN-LABEL: test_fmaximum_f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fmaximum_f32_vs(float %a, float inreg %b) {
-; GCN-LABEL: test_fmaximum_f32_vs:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fmaximum_nnan_f32(float %a, float %b) {
-; GCN-LABEL: test_fmaximum_nnan_f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_nnan_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_nnan_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call nnan float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
+define amdgpu_ps float @test_fmaximum_nsz_f32(float %a, float %b) {
+; GFX9-LABEL: test_fmaximum_nsz_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_nsz_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call nsz float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fmaximum_signed_zero_f32() {
+; GFX9-LABEL: test_fmaximum_signed_zero_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_signed_zero_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float -0.0, float 0.0)
+ ret float %val
+}
+
define amdgpu_ps <2 x float> @test_fmaximum_v2f32(<2 x float> %a, <2 x float> %b) {
-; GCN-LABEL: test_fmaximum_v2f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v2
-; GCN-NEXT: v_maximum_f32 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v2
+; GFX12-NEXT: v_maximum_f32 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <2 x float> @test_fmaximum_v2f32_ss(<2 x float> inreg %a, <2 x float> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f32 s0, s0, s2
-; GCN-NEXT: s_maximum_f32 s1, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v2f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_max_f32_e32 v3, s1, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f32 s0, s0, s2
+; GFX12-NEXT: s_maximum_f32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <3 x float> @test_fmaximum_v3f32(<3 x float> %a, <3 x float> %b) {
-; GCN-LABEL: test_fmaximum_v3f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v3
-; GCN-NEXT: v_maximum_f32 v1, v1, v4
-; GCN-NEXT: v_maximum_f32 v2, v2, v5
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v6, v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_max_f32_e32 v3, v1, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v3, v2, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v3
+; GFX12-NEXT: v_maximum_f32 v1, v1, v4
+; GFX12-NEXT: v_maximum_f32 v2, v2, v5
+; GFX12-NEXT: ; return to shader part epilog
%val = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %val
}
define amdgpu_ps <4 x float> @test_fmaximum_v4f32(<4 x float> %a, <4 x float> %b) {
-; GCN-LABEL: test_fmaximum_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v4
-; GCN-NEXT: v_maximum_f32 v1, v1, v5
-; GCN-NEXT: v_maximum_f32 v2, v2, v6
-; GCN-NEXT: v_maximum_f32 v3, v3, v7
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v8, v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v1, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v2, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v3, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v4
+; GFX12-NEXT: v_maximum_f32 v1, v1, v5
+; GFX12-NEXT: v_maximum_f32 v2, v2, v6
+; GFX12-NEXT: v_maximum_f32 v3, v3, v7
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x float> @llvm.maximum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %val
}
define amdgpu_ps <16 x float> @test_fmaximum_v16f32(<16 x float> %a, <16 x float> %b) {
-; GCN-LABEL: test_fmaximum_v16f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v16
-; GCN-NEXT: v_maximum_f32 v1, v1, v17
-; GCN-NEXT: v_maximum_f32 v2, v2, v18
-; GCN-NEXT: v_maximum_f32 v3, v3, v19
-; GCN-NEXT: v_maximum_f32 v4, v4, v20
-; GCN-NEXT: v_maximum_f32 v5, v5, v21
-; GCN-NEXT: v_maximum_f32 v6, v6, v22
-; GCN-NEXT: v_maximum_f32 v7, v7, v23
-; GCN-NEXT: v_maximum_f32 v8, v8, v24
-; GCN-NEXT: v_maximum_f32 v9, v9, v25
-; GCN-NEXT: v_maximum_f32 v10, v10, v26
-; GCN-NEXT: v_maximum_f32 v11, v11, v27
-; GCN-NEXT: v_maximum_f32 v12, v12, v28
-; GCN-NEXT: v_maximum_f32 v13, v13, v29
-; GCN-NEXT: v_maximum_f32 v14, v14, v30
-; GCN-NEXT: v_maximum_f32 v15, v15, v31
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v16f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v32, v1, v17
+; GFX9-NEXT: v_mov_b32_e32 v33, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v16
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v0, v16
+; GFX9-NEXT: v_max_f32_e32 v17, v2, v18
+; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], v2, v18
+; GFX9-NEXT: v_max_f32_e32 v18, v3, v19
+; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], v3, v19
+; GFX9-NEXT: v_max_f32_e32 v19, v4, v20
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v20
+; GFX9-NEXT: v_max_f32_e32 v20, v5, v21
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v5, v21
+; GFX9-NEXT: v_max_f32_e32 v21, v6, v22
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v6, v22
+; GFX9-NEXT: v_max_f32_e32 v22, v7, v23
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v7, v23
+; GFX9-NEXT: v_max_f32_e32 v23, v8, v24
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v33, v1, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v24
+; GFX9-NEXT: v_max_f32_e32 v34, v9, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v23, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v25
+; GFX9-NEXT: v_max_f32_e32 v35, v10, v26
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v26
+; GFX9-NEXT: v_max_f32_e32 v36, v11, v27
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v35, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v27
+; GFX9-NEXT: v_max_f32_e32 v37, v12, v28
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v36, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v28
+; GFX9-NEXT: v_max_f32_e32 v16, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v37, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v16, vcc
+; GFX9-NEXT: v_max_f32_e32 v16, v14, v30
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v30
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v16, vcc
+; GFX9-NEXT: v_max_f32_e32 v16, v15, v31
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v31
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v33, v17, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v33, v18, s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v33, v19, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v33, v20, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v33, v21, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v33, v22, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v16, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v16f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v16
+; GFX12-NEXT: v_maximum_f32 v1, v1, v17
+; GFX12-NEXT: v_maximum_f32 v2, v2, v18
+; GFX12-NEXT: v_maximum_f32 v3, v3, v19
+; GFX12-NEXT: v_maximum_f32 v4, v4, v20
+; GFX12-NEXT: v_maximum_f32 v5, v5, v21
+; GFX12-NEXT: v_maximum_f32 v6, v6, v22
+; GFX12-NEXT: v_maximum_f32 v7, v7, v23
+; GFX12-NEXT: v_maximum_f32 v8, v8, v24
+; GFX12-NEXT: v_maximum_f32 v9, v9, v25
+; GFX12-NEXT: v_maximum_f32 v10, v10, v26
+; GFX12-NEXT: v_maximum_f32 v11, v11, v27
+; GFX12-NEXT: v_maximum_f32 v12, v12, v28
+; GFX12-NEXT: v_maximum_f32 v13, v13, v29
+; GFX12-NEXT: v_maximum_f32 v14, v14, v30
+; GFX12-NEXT: v_maximum_f32 v15, v15, v31
+; GFX12-NEXT: ; return to shader part epilog
%val = call <16 x float> @llvm.maximum.v16f32(<16 x float> %a, <16 x float> %b)
ret <16 x float> %val
}
define amdgpu_ps half @test_fmaximum_f16_vv(half %a, half %b) {
+; GFX9-LABEL: test_fmaximum_f16_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-TRUE16-LABEL: test_fmaximum_f16_vv:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v1.l
@@ -136,35 +315,131 @@ define amdgpu_ps half @test_fmaximum_f16_vv(half %a, half %b) {
}
define amdgpu_ps half @test_fmaximum_f16_ss(half inreg %a, half inreg %b) {
-; GCN-LABEL: test_fmaximum_f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f16 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f16_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call half @llvm.maximum.f16(half %a, half %b)
ret half %val
}
define amdgpu_ps <2 x half> @test_fmaximum_v2f16_vv(<2 x half> %a, <2 x half> %b) {
-; GCN-LABEL: test_fmaximum_v2f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f16_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <2 x half> @test_fmaximum_v2f16_ss(<2 x half> inreg %a, <2 x half> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-GISEL-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, s0, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
+; GFX9-SDAG-LABEL: test_fmaximum_v3f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v3f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v2
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v5, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fmaximum_v3f16_vv:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, v0, v2
@@ -187,6 +462,49 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b
}
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
+; GFX9-SDAG-LABEL: test_fmaximum_v3f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, s0, v3
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v3f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_pk_max_f16 v3, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s1, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fmaximum_v3f16_ss:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, s0, s2
@@ -206,97 +524,384 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x ha
}
define amdgpu_ps <4 x half> @test_fmaximum_v4f16(<4 x half> %a, <4 x half> %b) {
-; GCN-LABEL: test_fmaximum_v4f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GCN-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v6, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <4 x half> @test_fmaximum_v4f16_ss(<4 x half> inreg %a, <4 x half> inreg %b) {
-; GCN-LABEL: test_fmaximum_v4f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, s0, s2
-; GCN-NEXT: v_pk_maximum_f16 v1, s1, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, s0, v4
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v5
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s3, 16
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s1, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s1, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, s0, s2
+; GFX12-NEXT: v_pk_maximum_f16 v1, s1, s3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <2 x float> @test_fmaximum_f64_vv(double %a, double %b) {
-; GCN-LABEL: test_fmaximum_f64_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_f64_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_f64_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.maximum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <2 x float> @test_fmaximum_f64_ss(double inreg %a, double inreg %b) {
-; GCN-LABEL: test_fmaximum_f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.maximum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <4 x float> @test_fmaximum_v2f64_ss(<2 x double> inreg %a, <2 x double> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[4:5]
-; GCN-NEXT: v_maximum_f64 v[2:3], s[2:3], s[6:7]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[6:7]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x double> @llvm.maximum.v2f64(<2 x double> %a, <2 x double> %b)
%ret = bitcast <2 x double> %val to <4 x float>
ret <4 x float> %ret
}
define amdgpu_ps <8 x float> @test_fmaximum_v4f64(<4 x double> %a, <4 x double> %b) {
-; GCN-LABEL: test_fmaximum_v4f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], v[0:1], v[8:9]
-; GCN-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
-; GCN-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
-; GCN-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f64:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f64:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v18, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v16, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v17, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v9, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v18, v11, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v18, v13, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.maximum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_ps <8 x float> @test_fmaximum_v4f64_ss(<4 x double> inreg %a, <4 x double> inreg %b) {
-; GCN-LABEL: test_fmaximum_v4f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[8:9]
-; GCN-NEXT: v_maximum_f64 v[2:3], s[2:3], s[10:11]
-; GCN-NEXT: v_maximum_f64 v[4:5], s[4:5], s[12:13]
-; GCN-NEXT: v_maximum_f64 v[6:7], s[6:7], s[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-SDAG-NEXT: v_max_f64 v[6:7], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-SDAG-NEXT: v_max_f64 v[8:9], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-GISEL-NEXT: v_max_f64 v[6:7], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-GISEL-NEXT: v_max_f64 v[8:9], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v5, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v10, v7, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v10, v9, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[8:9]
+; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[10:11]
+; GFX12-NEXT: v_maximum_f64 v[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: v_maximum_f64 v[6:7], s[6:7], s[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.maximum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GCN-LABEL: fmaximumi_f32_move_to_valu:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: v_maximum_f32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: fmaximumi_f32_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: fmaximumi_f32_move_to_valu:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_maximum_f32 v1, v1, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%v = call float @llvm.maximum.f32(float %a, float %b)
@@ -305,6 +910,23 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
+; GFX9-LABEL: fmaximum_f16_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
; GFX12-SDAG-TRUE16-LABEL: fmaximum_f16_move_to_valu:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
@@ -371,6 +993,40 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
ret void
}
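+; The next two tests check that the llvm.maximum expansion is the same
+; whether the "amdgpu-ieee" attribute is "true" (#0) or "false" (#1).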
+define amdgpu_ps float @test_fmaximum_f32_ieee_on(float %a, float %b) #0 {
+; GFX9-LABEL: test_fmaximum_f32_ieee_on:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ieee_on:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fmaximum_f32_ieee_off(float %a, float %b) #1 {
+; GFX9-LABEL: test_fmaximum_f32_ieee_off:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ieee_off:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
declare float @llvm.maximum.f32(float, float)
declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
declare <3 x float> @llvm.maximum.v3f32(<3 x float>, <3 x float>)
@@ -383,3 +1039,6 @@ declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.maximum.f64(double, double)
declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
+
+attributes #0 = { nounwind "amdgpu-ieee"="true" }
+attributes #1 = { nounwind "amdgpu-ieee"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
index bc85dc2..3e513de 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
@@ -219,8 +219,8 @@ define <2 x bfloat> @v_test_fmed3_r_i_i_v2bf16_minimumnum_maximumnum(<2 x bfloat
}
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
-attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+attributes #1 = { nounwind "no-nans-fp-math"="false" }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11: {{.*}}
; GFX11-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 3145a27..60ac0b9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -8905,4 +8905,4 @@ declare half @llvm.minnum.f16(half, half) #0
declare half @llvm.maxnum.f16(half, half) #0
attributes #0 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index b25120f..474ac7c 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -1,117 +1,296 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
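+; GFX9 has no v_minimum/v_maximum instructions, so llvm.minimum is expanded
+; into v_min plus an ordered compare that selects a canonical quiet NaN when
+; either input is NaN (0x7e00 for f16, 0x7fc00000 for f32, and 0x7ff80000 in
+; the high dword for f64).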
define amdgpu_ps float @test_fminimum_f32_vv(float %a, float %b) {
-; GCN-LABEL: test_fminimum_f32_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_f32_ss(float inreg %a, float inreg %b) {
-; GCN-LABEL: test_fminimum_f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_f32_vs(float %a, float inreg %b) {
-; GCN-LABEL: test_fminimum_f32_vs:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_nnan_f32(float %a, float %b) {
-; GCN-LABEL: test_fminimum_nnan_f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_nnan_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_nnan_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call nnan float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
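+; Unlike nnan, the nsz flag alone does not allow folding to a bare v_min on
+; GFX9; the NaN compare-and-select is still emitted.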
+define amdgpu_ps float @test_fminimum_nsz_f32(float %a, float %b) {
+; GFX9-LABEL: test_fminimum_nsz_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_nsz_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call nsz float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
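+; minimum(-0.0, +0.0) constant-folds to -0.0: v_bfrev_b32 v0, 1 bit-reverses
+; the immediate 1 into 0x80000000, the bit pattern of -0.0f.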
+define amdgpu_ps float @test_fminimum_signed_zero_f32() {
+; GFX9-LABEL: test_fminimum_signed_zero_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_signed_zero_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_bfrev_b32_e32 v0, 1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float -0.0, float 0.0)
+ ret float %val
+}
+
define amdgpu_ps <2 x float> @test_fminimum_v2f32(<2 x float> %a, <2 x float> %b) {
-; GCN-LABEL: test_fminimum_v2f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v2
-; GCN-NEXT: v_minimum_f32 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v2
+; GFX12-NEXT: v_minimum_f32 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <2 x float> @test_fminimum_v2f32_ss(<2 x float> inreg %a, <2 x float> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f32 s0, s0, s2
-; GCN-NEXT: s_minimum_f32 s1, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v2f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_min_f32_e32 v3, s1, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f32 s0, s0, s2
+; GFX12-NEXT: s_minimum_f32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <3 x float> @test_fminimum_v3f32(<3 x float> %a, <3 x float> %b) {
-; GCN-LABEL: test_fminimum_v3f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v3
-; GCN-NEXT: v_minimum_f32 v1, v1, v4
-; GCN-NEXT: v_minimum_f32 v2, v2, v5
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v6, v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_min_f32_e32 v3, v1, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v3, v2, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v3
+; GFX12-NEXT: v_minimum_f32 v1, v1, v4
+; GFX12-NEXT: v_minimum_f32 v2, v2, v5
+; GFX12-NEXT: ; return to shader part epilog
%val = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %val
}
define amdgpu_ps <4 x float> @test_fminimum_v4f32(<4 x float> %a, <4 x float> %b) {
-; GCN-LABEL: test_fminimum_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v4
-; GCN-NEXT: v_minimum_f32 v1, v1, v5
-; GCN-NEXT: v_minimum_f32 v2, v2, v6
-; GCN-NEXT: v_minimum_f32 v3, v3, v7
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v8, v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v1, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v2, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v3, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v4
+; GFX12-NEXT: v_minimum_f32 v1, v1, v5
+; GFX12-NEXT: v_minimum_f32 v2, v2, v6
+; GFX12-NEXT: v_minimum_f32 v3, v3, v7
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x float> @llvm.minimum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %val
}
define amdgpu_ps <16 x float> @test_fminimum_v16f32(<16 x float> %a, <16 x float> %b) {
-; GCN-LABEL: test_fminimum_v16f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v16
-; GCN-NEXT: v_minimum_f32 v1, v1, v17
-; GCN-NEXT: v_minimum_f32 v2, v2, v18
-; GCN-NEXT: v_minimum_f32 v3, v3, v19
-; GCN-NEXT: v_minimum_f32 v4, v4, v20
-; GCN-NEXT: v_minimum_f32 v5, v5, v21
-; GCN-NEXT: v_minimum_f32 v6, v6, v22
-; GCN-NEXT: v_minimum_f32 v7, v7, v23
-; GCN-NEXT: v_minimum_f32 v8, v8, v24
-; GCN-NEXT: v_minimum_f32 v9, v9, v25
-; GCN-NEXT: v_minimum_f32 v10, v10, v26
-; GCN-NEXT: v_minimum_f32 v11, v11, v27
-; GCN-NEXT: v_minimum_f32 v12, v12, v28
-; GCN-NEXT: v_minimum_f32 v13, v13, v29
-; GCN-NEXT: v_minimum_f32 v14, v14, v30
-; GCN-NEXT: v_minimum_f32 v15, v15, v31
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v16f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v32, v1, v17
+; GFX9-NEXT: v_mov_b32_e32 v33, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v16
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v0, v16
+; GFX9-NEXT: v_min_f32_e32 v17, v2, v18
+; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], v2, v18
+; GFX9-NEXT: v_min_f32_e32 v18, v3, v19
+; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], v3, v19
+; GFX9-NEXT: v_min_f32_e32 v19, v4, v20
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v20
+; GFX9-NEXT: v_min_f32_e32 v20, v5, v21
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v5, v21
+; GFX9-NEXT: v_min_f32_e32 v21, v6, v22
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v6, v22
+; GFX9-NEXT: v_min_f32_e32 v22, v7, v23
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v7, v23
+; GFX9-NEXT: v_min_f32_e32 v23, v8, v24
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v33, v1, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v24
+; GFX9-NEXT: v_min_f32_e32 v34, v9, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v23, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v25
+; GFX9-NEXT: v_min_f32_e32 v35, v10, v26
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v26
+; GFX9-NEXT: v_min_f32_e32 v36, v11, v27
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v35, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v27
+; GFX9-NEXT: v_min_f32_e32 v37, v12, v28
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v36, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v28
+; GFX9-NEXT: v_min_f32_e32 v16, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v37, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v16, vcc
+; GFX9-NEXT: v_min_f32_e32 v16, v14, v30
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v30
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v16, vcc
+; GFX9-NEXT: v_min_f32_e32 v16, v15, v31
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v31
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v33, v17, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v33, v18, s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v33, v19, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v33, v20, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v33, v21, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v33, v22, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v16, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v16f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v16
+; GFX12-NEXT: v_minimum_f32 v1, v1, v17
+; GFX12-NEXT: v_minimum_f32 v2, v2, v18
+; GFX12-NEXT: v_minimum_f32 v3, v3, v19
+; GFX12-NEXT: v_minimum_f32 v4, v4, v20
+; GFX12-NEXT: v_minimum_f32 v5, v5, v21
+; GFX12-NEXT: v_minimum_f32 v6, v6, v22
+; GFX12-NEXT: v_minimum_f32 v7, v7, v23
+; GFX12-NEXT: v_minimum_f32 v8, v8, v24
+; GFX12-NEXT: v_minimum_f32 v9, v9, v25
+; GFX12-NEXT: v_minimum_f32 v10, v10, v26
+; GFX12-NEXT: v_minimum_f32 v11, v11, v27
+; GFX12-NEXT: v_minimum_f32 v12, v12, v28
+; GFX12-NEXT: v_minimum_f32 v13, v13, v29
+; GFX12-NEXT: v_minimum_f32 v14, v14, v30
+; GFX12-NEXT: v_minimum_f32 v15, v15, v31
+; GFX12-NEXT: ; return to shader part epilog
%val = call <16 x float> @llvm.minimum.v16f32(<16 x float> %a, <16 x float> %b)
ret <16 x float> %val
}
define amdgpu_ps half @test_fminimum_f16_vv(half %a, half %b) {
+; GFX9-LABEL: test_fminimum_f16_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-TRUE16-LABEL: test_fminimum_f16_vv:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v1.l
@@ -136,35 +315,131 @@ define amdgpu_ps half @test_fminimum_f16_vv(half %a, half %b) {
}
define amdgpu_ps half @test_fminimum_f16_ss(half inreg %a, half inreg %b) {
-; GCN-LABEL: test_fminimum_f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f16 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f16_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call half @llvm.minimum.f16(half %a, half %b)
ret half %val
}
define amdgpu_ps <2 x half> @test_fminimum_v2f16_vv(<2 x half> %a, <2 x half> %b) {
-; GCN-LABEL: test_fminimum_v2f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f16_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <2 x half> @test_fminimum_v2f16_ss(<2 x half> inreg %a, <2 x half> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-GISEL-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, s0, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
+; GFX9-SDAG-LABEL: test_fminimum_v3f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v3f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v2
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v5, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fminimum_v3f16_vv:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, v0, v2
@@ -187,6 +462,49 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
+; GFX9-SDAG-LABEL: test_fminimum_v3f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, s0, v3
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v3f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_pk_min_f16 v3, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s1, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fminimum_v3f16_ss:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, s0, s2
@@ -206,97 +524,384 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x ha
}
define amdgpu_ps <4 x half> @test_fminimum_v4f16(<4 x half> %a, <4 x half> %b) {
-; GCN-LABEL: test_fminimum_v4f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GCN-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v6, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <4 x half> @test_fminimum_v4f16_ss(<4 x half> inreg %a, <4 x half> inreg %b) {
-; GCN-LABEL: test_fminimum_v4f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, s0, s2
-; GCN-NEXT: v_pk_minimum_f16 v1, s1, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, s0, v4
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v5
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s3, 16
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s1, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s1, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, s0, s2
+; GFX12-NEXT: v_pk_minimum_f16 v1, s1, s3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <2 x float> @test_fminimum_f64_vv(double %a, double %b) {
-; GCN-LABEL: test_fminimum_f64_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_f64_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_f64_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.minimum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <2 x float> @test_fminimum_f64_ss(double inreg %a, double inreg %b) {
-; GCN-LABEL: test_fminimum_f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.minimum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <4 x float> @test_fminimum_v2f64_ss(<2 x double> inreg %a, <2 x double> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[4:5]
-; GCN-NEXT: v_minimum_f64 v[2:3], s[2:3], s[6:7]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[6:7]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x double> @llvm.minimum.v2f64(<2 x double> %a, <2 x double> %b)
%ret = bitcast <2 x double> %val to <4 x float>
ret <4 x float> %ret
}
define amdgpu_ps <8 x float> @test_fminimum_v4f64(<4 x double> %a, <4 x double> %b) {
-; GCN-LABEL: test_fminimum_v4f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], v[0:1], v[8:9]
-; GCN-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
-; GCN-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
-; GCN-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f64:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f64:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v18, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v16, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v17, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v9, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v18, v11, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v18, v13, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.minimum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_ps <8 x float> @test_fminimum_v4f64_ss(<4 x double> inreg %a, <4 x double> inreg %b) {
-; GCN-LABEL: test_fminimum_v4f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[8:9]
-; GCN-NEXT: v_minimum_f64 v[2:3], s[2:3], s[10:11]
-; GCN-NEXT: v_minimum_f64 v[4:5], s[4:5], s[12:13]
-; GCN-NEXT: v_minimum_f64 v[6:7], s[6:7], s[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-SDAG-NEXT: v_min_f64 v[6:7], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-SDAG-NEXT: v_min_f64 v[8:9], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-GISEL-NEXT: v_min_f64 v[6:7], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-GISEL-NEXT: v_min_f64 v[8:9], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v5, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v10, v7, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v10, v9, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[8:9]
+; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[10:11]
+; GFX12-NEXT: v_minimum_f64 v[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: v_minimum_f64 v[6:7], s[6:7], s[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.minimum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GCN-LABEL: fminimumi_f32_move_to_valu:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: v_minimum_f32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: fminimumi_f32_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: fminimumi_f32_move_to_valu:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_minimum_f32 v1, v1, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%v = call float @llvm.minimum.f32(float %a, float %b)
@@ -305,6 +910,23 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
+; GFX9-LABEL: fminimum_f16_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
; GFX12-SDAG-TRUE16-LABEL: fminimum_f16_move_to_valu:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
@@ -371,6 +993,40 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
ret void
}
+define amdgpu_ps float @test_fminimum_f32_ieee_on(float %a, float %b) #0 {
+; GFX9-LABEL: test_fminimum_f32_ieee_on:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ieee_on:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fminimum_f32_ieee_off(float %a, float %b) #1 {
+; GFX9-LABEL: test_fminimum_f32_ieee_off:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ieee_off:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
declare float @llvm.minimum.f32(float, float)
declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
declare <3 x float> @llvm.minimum.v3f32(<3 x float>, <3 x float>)
@@ -383,3 +1039,6 @@ declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.minimum.f64(double, double)
declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
+
+attributes #0 = { nounwind "amdgpu-ieee"="true" }
+attributes #1 = { nounwind "amdgpu-ieee"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
index d8bbda1..69d1ee3f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
@@ -159,7 +159,7 @@ declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
index aaea4f7..b3202cb 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -8006,7 +8006,7 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN-NSZ: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index b0dd187..c28b25c7 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -599,10 +599,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s7, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s6, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12
@@ -711,10 +709,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -824,10 +820,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -937,10 +931,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -1118,17 +1110,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1175,17 +1165,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1366,17 +1354,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1423,17 +1409,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -2154,10 +2138,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s9, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9
; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12
@@ -2193,12 +2175,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
-; SI-GISEL-NEXT: s_or_b32 s6, s9, s6
; SI-GISEL-NEXT: s_or_b32 s3, s4, s3
-; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s9, s6
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12
@@ -2355,10 +2335,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; VI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_or_b32 s3, s3, s4
-; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2
; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2392,14 +2370,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_or_b32 s5, s5, s6
-; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_or_b32 s4, s4, s5
-; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3
; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2555,10 +2531,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2592,14 +2566,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2752,10 +2724,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2789,14 +2759,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -3073,17 +3041,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3115,19 +3081,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3176,17 +3140,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3218,19 +3180,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3511,17 +3471,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3553,19 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3614,17 +3570,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3656,19 +3610,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index f1165491..b6b26a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -182,7 +182,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; SI-NEXT: s_and_b32 s1, s7, 0x1ff
; SI-NEXT: s_and_b32 s8, s0, 0xffe
; SI-NEXT: s_or_b32 s0, s1, s6
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
@@ -237,7 +236,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe
; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff
; VI-SDAG-NEXT: s_or_b32 s4, s4, s6
-; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -290,10 +288,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -335,11 +331,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8
+; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2
; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
@@ -387,16 +382,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
@@ -438,11 +431,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8
+; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2
; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -498,17 +490,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 6f91222..d8cbdb1 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -2048,7 +2048,7 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1200-FAKE16-NEXT: s_endpgm
- ptr addrspace(1) %in2) #1 {
+ ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
%r1 = load half, ptr addrspace(1) %gep2, align 4
@@ -3417,7 +3417,7 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1200-NEXT: s_endpgm
- ptr addrspace(1) %in2) #1 {
+ ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
%r1 = load float, ptr addrspace(1) %gep2, align 4
@@ -4821,7 +4821,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1]
; GFX1200-NEXT: s_endpgm
- ptr addrspace(1) %in2) #1 {
+ ptr addrspace(1) %in2) #0 {
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
%r2 = frem afn double %r0, %r1
@@ -18918,7 +18918,4 @@ define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) #0 {
-attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
-
-
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 1b74ddf..9b97981 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -2870,7 +2870,7 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
ret double %result
}
-define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
+define double @v_sqrt_f64__unsafe_attr(double %x) {
; GFX6-SDAG-LABEL: v_sqrt_f64__unsafe_attr:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3449,7 +3449,6 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) #1
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nounwind willreturn memory(none) }
attributes #3 = { "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
-attributes #4 = { "unsafe-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX6: {{.*}}
; GFX8: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
index 9f19bcb..c93c077 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
@@ -239,4 +239,4 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 94afa88..9ebf6ae 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4666,21 +4666,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr add
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_or_b32_e32 v0, 16, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: v_or_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_or_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 16
%addr = inttoptr i64 %or to ptr addrspace(1)
@@ -4707,21 +4699,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_or_b32_e32 v0, 0x1040, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 4160
%addr = inttoptr i64 %or to ptr addrspace(1)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 37756d1..31f277f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -472,7 +472,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -536,11 +535,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -606,7 +604,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -660,12 +657,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -710,9 +706,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1690,7 +1685,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1754,11 +1748,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1824,7 +1817,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1878,12 +1870,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1928,9 +1919,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2968,7 +2958,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3032,11 +3021,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3102,7 +3090,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3156,12 +3143,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3206,9 +3192,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3742,7 +3727,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3806,11 +3790,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3876,7 +3859,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3930,12 +3912,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3980,9 +3961,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5019,7 +4999,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5083,11 +5062,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5153,7 +5131,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5207,12 +5184,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5270,9 +5246,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6284,7 +6259,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,7 +6328,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6424,7 +6397,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6485,8 +6457,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6550,7 +6520,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7717,7 +7686,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7787,7 +7755,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7857,7 +7824,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7918,8 +7884,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7983,7 +7947,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9150,7 +9113,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9220,7 +9182,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9290,7 +9251,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9351,8 +9311,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9416,7 +9374,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10065,7 +10022,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10135,7 +10091,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10205,7 +10160,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10266,8 +10220,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10331,7 +10283,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11498,7 +11449,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11568,7 +11518,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11638,7 +11587,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11699,8 +11647,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11764,7 +11710,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6351bb3..4581efc 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index a9ac008..bd570d9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 6311143..1f2d70c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -532,7 +532,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -596,11 +595,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -666,7 +664,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -720,12 +717,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -783,9 +779,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1862,7 +1857,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1926,11 +1920,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1996,7 +1989,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2050,12 +2042,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2113,9 +2104,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3192,7 +3182,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3256,11 +3245,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3326,7 +3314,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3380,12 +3367,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3443,9 +3429,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4018,7 +4003,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4082,11 +4066,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4152,7 +4135,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4206,12 +4188,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4269,9 +4250,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5347,7 +5327,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5411,11 +5390,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5481,7 +5459,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5535,12 +5512,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5598,9 +5574,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6612,7 +6587,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6682,7 +6656,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6752,7 +6725,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6813,8 +6785,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6878,7 +6848,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8044,7 +8013,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8114,7 +8082,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8184,7 +8151,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8245,8 +8211,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8310,7 +8274,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9477,7 +9440,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9547,7 +9509,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9617,7 +9578,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9678,8 +9638,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9743,7 +9701,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10392,7 +10349,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10462,7 +10418,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10532,7 +10487,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10593,8 +10547,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10658,7 +10610,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11824,7 +11775,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11894,7 +11844,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11964,7 +11913,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12025,8 +11973,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12090,7 +12036,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
index 4e93eca..c33b3344 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
@@ -36,18 +36,18 @@ entry:
ret void
}
-attributes #0 = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true"}
-attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "uniform-work-group-size"="false"}
+attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" }
;.
-; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" }
+; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" }
;.
-; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" }
+; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" }
;.
-; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" }
+; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" }
;.
; UNSAFE: [[META0]] = !{}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index eee232a..c3f3917 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -136,19 +136,17 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: .LBB2_6: ; %bb18
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: v_readfirstlane_b32 s13, v0
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
-; GFX11-NEXT: s_and_b32 s1, s8, s1
-; GFX11-NEXT: s_and_b32 s1, s1, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_cselect_b32 s13, -1, 0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-NEXT: s_and_b32 s13, s8, s13
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_and_b32 s13, s13, exec_lo
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
-; GFX11-NEXT: s_cselect_b32 s1, s19, s13
-; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
+; GFX11-NEXT: s_cselect_b32 s1, s19, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s1, s1, 1
-; GFX11-NEXT: s_cmp_lg_u32 s13, 0
+; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
; GFX11-NEXT: s_cselect_b32 s13, -1, 0
; GFX11-NEXT: s_and_b32 s20, s9, exec_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index 9684712..2f9182e 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -1066,13 +1066,13 @@ define double @uitofp_i128_to_f64(i128 %x) {
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GISEL-NEXT: v_lshlrev_b64 v[8:9], 30, v[2:3]
; GISEL-NEXT: v_lshrrev_b32_e32 v5, 2, v1
-; GISEL-NEXT: v_or_b32_e32 v9, v5, v8
+; GISEL-NEXT: v_or_b32_e32 v9, v8, v5
; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20
; GISEL-NEXT: v_lshlrev_b64 v[2:3], 29, v[2:3]
; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1
-; GISEL-NEXT: v_or_b32_e32 v9, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v9, v2, v0
; GISEL-NEXT: v_mov_b32_e32 v7, v6
; GISEL-NEXT: ; %bb.12: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll
new file mode 100644
index 0000000..99421d4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-GISEL %s
+
+declare i32 @llvm.amdgcn.add.min.i32(i32, i32, i32, i1)
+declare i32 @llvm.amdgcn.add.max.i32(i32, i32, i32, i1)
+declare i32 @llvm.amdgcn.add.min.u32(i32, i32, i32, i1)
+declare i32 @llvm.amdgcn.add.max.u32(i32, i32, i32, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+
+define i32 @test_add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_min_i32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_min_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_min_i32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_i32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define i32 @test_add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_min_u32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_min_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_min_u32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_u32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_max_i32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_max_i32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_i32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_max_u32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_max_u32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_u32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define <2 x i16> @test_add_min_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_min_i16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_min_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_min_i16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_min_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_min_u16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_min_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_min_u16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_max_i16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_max_i16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_max_u16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_max_u16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 883db20..e30a586 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -1485,7 +1485,7 @@ define float @v_exp2_f32_fast(float %in) {
ret float %result
}
-define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
+define float @v_exp2_f32_unsafe_math_attr(float %in) {
; SI-SDAG-LABEL: v_exp2_f32_unsafe_math_attr:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 0854134..61a777f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -1907,7 +1907,7 @@ define float @v_log2_f32_fast(float %in) {
ret float %result
}
-define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
+define float @v_log2_f32_unsafe_math_attr(float %in) {
; SI-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 8748aff..6dc9199 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -8265,12 +8265,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
-; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
+; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB28_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -8351,14 +8349,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX942-NEXT: .LBB28_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX942-NEXT: v_readfirstlane_b32 s8, v1
-; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
; GFX942-NEXT: s_mov_b32 m0, s3
+; GFX942-NEXT: v_readlane_b32 s8, v2, s3
+; GFX942-NEXT: v_writelane_b32 v0, s6, m0
+; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_writelane_b32 v0, s8, m0
-; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB28_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8440,15 +8437,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX11-NEXT: .LBB28_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
-; GFX11-NEXT: s_lshl_b32 s7, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_lshl_b32 s1, 1, s1
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
; GFX11-NEXT: s_cbranch_scc1 .LBB28_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8528,11 +8524,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
-; GFX10-NEXT: s_lshl_b32 s7, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_lshl_b32 s1, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s1
; GFX10-NEXT: s_cbranch_scc1 .LBB28_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8609,14 +8604,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
-; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
; GFX90A-NEXT: s_mov_b32 m0, s3
+; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
+; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
+; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
-; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8692,14 +8686,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX908-NEXT: .LBB28_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX908-NEXT: v_readfirstlane_b32 s8, v1
-; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
; GFX908-NEXT: s_mov_b32 m0, s3
+; GFX908-NEXT: v_readlane_b32 s8, v2, s3
+; GFX908-NEXT: v_writelane_b32 v0, s6, m0
+; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_writelane_b32 v0, s8, m0
-; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB28_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8776,14 +8769,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX8-NEXT: .LBB28_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX8-NEXT: v_readfirstlane_b32 s8, v1
-; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
; GFX8-NEXT: s_mov_b32 m0, s3
+; GFX8-NEXT: v_readlane_b32 s8, v2, s3
+; GFX8-NEXT: v_writelane_b32 v0, s6, m0
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_writelane_b32 v0, s8, m0
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB28_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9130,12 +9122,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
-; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
+; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB29_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -9212,14 +9202,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX942-NEXT: .LBB29_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX942-NEXT: v_readfirstlane_b32 s8, v1
-; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
; GFX942-NEXT: s_mov_b32 m0, s3
+; GFX942-NEXT: v_readlane_b32 s8, v2, s3
+; GFX942-NEXT: v_writelane_b32 v0, s6, m0
+; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_writelane_b32 v0, s8, m0
-; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB29_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9296,15 +9285,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: .LBB29_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
-; GFX11-NEXT: s_lshl_b32 s7, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_lshl_b32 s1, 1, s1
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
; GFX11-NEXT: s_cbranch_scc1 .LBB29_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9377,11 +9365,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
-; GFX10-NEXT: s_lshl_b32 s7, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_lshl_b32 s1, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s1
; GFX10-NEXT: s_cbranch_scc1 .LBB29_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9453,14 +9440,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
-; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
; GFX90A-NEXT: s_mov_b32 m0, s3
+; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
+; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
+; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
-; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9533,14 +9519,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX908-NEXT: .LBB29_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX908-NEXT: v_readfirstlane_b32 s8, v1
-; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
; GFX908-NEXT: s_mov_b32 m0, s3
+; GFX908-NEXT: v_readlane_b32 s8, v2, s3
+; GFX908-NEXT: v_writelane_b32 v0, s6, m0
+; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_writelane_b32 v0, s8, m0
-; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB29_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9614,14 +9599,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: .LBB29_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX8-NEXT: v_readfirstlane_b32 s8, v1
-; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
; GFX8-NEXT: s_mov_b32 m0, s3
+; GFX8-NEXT: v_readlane_b32 s8, v2, s3
+; GFX8-NEXT: v_writelane_b32 v0, s6, m0
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_writelane_b32 v0, s8, m0
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB29_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 1e4b633..fc36ed9 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -45,27 +45,18 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
-; GFX9-GISEL-NEXT: s_mov_b32 s11, 0
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
@@ -105,27 +96,18 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-GISEL: ; %bb.0: ; %entry
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10
-; GFX942-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s10, s7
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s8, s1
; GFX942-GISEL-NEXT: s_mov_b32 s9, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GFX942-GISEL-NEXT: s_mov_b32 s10, s3
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
-; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
-; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
+; GFX942-GISEL-NEXT: s_load_dword s7, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s1
; GFX942-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt
@@ -168,29 +150,22 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10
-; GFX10-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX10-GISEL-NEXT: s_mov_b32 s10, s7
-; GFX10-GISEL-NEXT: s_mov_b32 s4, s7
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s11, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5]
-; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen slc
+; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
-; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30
+; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; GFX10-GISEL-NEXT: s_endpgm
@@ -234,32 +209,21 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX11-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX11-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX11-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s11, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen slc dlc
+; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX11-GISEL-NEXT: s_mov_b32 s4, s9
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen glc slc dlc
; GFX11-GISEL-NEXT: s_endpgm
@@ -303,32 +267,21 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX12-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX12-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX12-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s11, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT
+; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX12-GISEL-NEXT: s_mov_b32 s4, s9
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT
; GFX12-GISEL-NEXT: s_endpgm
@@ -374,28 +327,19 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
-; GFX9-GISEL-NEXT: s_mov_b32 s11, 0
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -436,28 +380,19 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL: ; %bb.0: ; %entry
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10
-; GFX942-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s10, s7
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s8, s1
; GFX942-GISEL-NEXT: s_mov_b32 s9, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GFX942-GISEL-NEXT: s_mov_b32 s10, s3
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
-; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
-; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
+; GFX942-GISEL-NEXT: s_load_dword s7, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s1
; GFX942-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -501,30 +436,23 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10
-; GFX10-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX10-GISEL-NEXT: s_mov_b32 s10, s7
-; GFX10-GISEL-NEXT: s_mov_b32 s4, s7
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s11, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5]
-; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc
+; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
-; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30
+; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-GISEL-NEXT: s_endpgm
@@ -569,33 +497,22 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX11-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX11-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX11-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s11, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc
+; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX11-GISEL-NEXT: s_mov_b32 s4, s9
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
@@ -640,33 +557,22 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX12-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX12-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX12-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s11, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX12-GISEL-NEXT: s_mov_b32 s4, s9
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index d578d2e..60570bd 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -1296,4 +1296,4 @@ declare half @llvm.minnum.f16(half, half)
declare half @llvm.maxnum.f16(half, half)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)
-attributes #0 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+attributes #0 = { nounwind "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index c1cf06e..fba42c4 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -388,9 +388,8 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc
- ; GCN-NEXT: S_NOP 0, implicit killed $scc
- ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
+ ; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
@@ -417,6 +416,80 @@ body: |
S_ENDPGM 0
...
+---
+name: xor_1_cmp_lg_0_killed_scc
+body: |
+ ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $vgpr0_vgpr1
+
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc
+ S_NOP 0, implicit killed $scc
+ S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+name: absdiff_1_cmp_lg_0_killed_scc
+body: |
+ ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $vgpr0_vgpr1
+
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc
+ S_NOP 0, implicit killed $scc
+ S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
---
name: and_1_cmp_eq_1_clobbered_scc
@@ -2070,8 +2143,7 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc
- ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
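The optimize-compare.mir updates all exercise the same peephole (presumably SIInstrInfo::optimizeCompareInstr, reached from the generic peephole pass): when the instruction that defines the compared register already sets SCC with an equivalent predicate, the s_cmp is deleted (or, as in the first hunk, merged into a single S_BITCMP1_B32), and any intervening "killed" flag on SCC is cleared because SCC now stays live through to the branch. The two new tests add S_XOR_B32 and S_ABSDIFF_I32 sources alongside the existing S_AND_B32 one; the essential before/after, taken from the AND hunk below:

; before: the AND's SCC def was dead and a separate compare recomputed it
%1:sreg_32 = S_AND_B32 3, killed %0, implicit-def dead $scc
S_CMP_LG_U32 killed %1, 0, implicit-def $scc
; after: the compare folds into the AND's own SCC def
%1:sreg_32 = S_AND_B32 3, killed %0, implicit-def $scc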
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index f53aaaa..dd5f838 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
@@ -10,7 +10,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: shl32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -25,7 +24,6 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: shl64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -40,7 +38,6 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: lshr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -55,7 +52,6 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: lshr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -70,7 +66,6 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: ashr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -85,7 +80,6 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: ashr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -100,7 +94,6 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) {
; CHECK-LABEL: abs32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_abs_i32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -121,7 +114,6 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: and32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -136,7 +128,6 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: and64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -151,7 +142,6 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: or32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -166,7 +156,6 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: or64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -181,7 +170,6 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -196,7 +184,6 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -211,7 +198,6 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nand32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -231,7 +217,6 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nand64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -251,7 +236,6 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -271,7 +255,6 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -291,7 +274,6 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xnor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -311,7 +293,6 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xnor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -331,7 +312,6 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: andn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -347,7 +327,6 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nandn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -363,7 +342,6 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: orn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -379,7 +357,6 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: orn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -395,7 +372,6 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) {
; CHECK-LABEL: bfe_i32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -433,7 +409,6 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) {
; CHECK-LABEL: bfe_u32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -513,7 +488,6 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) {
; CHECK-LABEL: bcnt132:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -552,7 +526,6 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) {
; CHECK-LABEL: quadmask32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -571,7 +544,6 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) {
; CHECK-LABEL: quadmask64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -590,7 +562,6 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) {
; CHECK-LABEL: not32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -609,7 +580,6 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
; CHECK-LABEL: not64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b64 s[0:1], s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -623,3 +593,35 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
%zext = zext i1 %cmp to i32
ret i32 %zext
}
+
+
+; --------------------------------------------------------------------------------
+; Negative tests
+; --------------------------------------------------------------------------------
+
+@1 = extern_weak dso_local addrspace(4) constant i32
+
+define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
+; CHECK-LABEL: si_pc_add_rel_offset_must_not_optimize:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getpc_b64 s[0:1]
+; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB35_2
+; CHECK-NEXT: ; %bb.1: ; %endif
+; CHECK-NEXT: s_mov_b32 s0, 1
+; CHECK-NEXT: s_branch .LBB35_3
+; CHECK-NEXT: .LBB35_2: ; %if
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_branch .LBB35_3
+; CHECK-NEXT: .LBB35_3:
+ %cmp = icmp ne ptr addrspace(4) @1, null
+ br i1 %cmp, label %endif, label %if
+
+if:
+ ret i32 0
+
+endif:
+ ret i32 1
+}
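Every positive test in s_cmp_0.ll loses the same instruction: the scalar ALU op (shift, logic op, abs, bfe, bcnt, quadmask, not) defines SCC itself, so the trailing s_cmp_lg_u32/s_cmp_lg_u64 against zero folds away. The IR shape behind all of them, as a minimal sketch (hypothetical function name; the real tests end in the same icmp-ne-zero plus zext tail visible at the end of not64 above):

define amdgpu_ps i32 @shl32_sketch(i32 inreg %val0, i32 inreg %val1) {
  %result = shl i32 %val0, %val1
  %cmp = icmp ne i32 %result, 0
  %zext = zext i1 %cmp to i32
  ret i32 %zext
}

The new negative test pins down the guard: the s_addc_u32 that completes the PC-relative address sets SCC to its carry-out, not to a result-is-nonzero test (and an extern_weak address may genuinely be null), so here the s_cmp_lg_u64 has to stay.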
diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
index a828ee0..7552f6b 100644
--- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
@@ -12,8 +12,6 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) {
; CHECK-LABEL: s_uaddo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_addc_u32 s0, 1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1)
@@ -32,8 +30,6 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: s_usubo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s0, s1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1)
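In s_uaddo_usubo_pseudo.ll the carry-out of s_add_u32/s_sub_u32 now feeds s_addc_u32/s_subb_u32 directly through SCC; the old round trip that materialized the carry into an SGPR pair with s_cselect_b64 and then compared the pair back against zero is gone. A minimal sketch of the overflow-intrinsic input (hypothetical name; the call matches the first test, while the exact use of the carry bit afterwards is illustrative):

define amdgpu_ps i32 @uaddo_sketch(i32 inreg %val0) {
  %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1)
  %carry = extractvalue { i32, i1 } %pair, 1
  %ext = zext i1 %carry to i32
  %sum = add i32 %ext, 1
  ret i32 %sum
}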
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 5f6d622..71f5a94 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -56,10 +56,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s15, 0, s16
; GCN-NEXT: s_add_u32 s16, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s12, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s15
; GCN-NEXT: s_mul_i32 s0, s12, s14
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -90,7 +89,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s15, s16, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s12
; GCN-NEXT: s_ashr_i32 s12, s7, 31
; GCN-NEXT: s_add_u32 s0, s6, s12
@@ -116,52 +114,50 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_addc_u32 s4, s4, 0
; GCN-NEXT: s_mul_i32 s14, s7, s14
-; GCN-NEXT: s_add_u32 s14, s1, s14
-; GCN-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NEXT: s_add_u32 s16, s1, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s16
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_addc_u32 s15, 0, s4
+; GCN-NEXT: s_addc_u32 s17, 0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mul_i32 s4, s10, s15
+; GCN-NEXT: s_mul_i32 s4, s10, s17
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s5, s11, s14
-; GCN-NEXT: s_add_i32 s16, s4, s5
-; GCN-NEXT: s_sub_i32 s17, s7, s16
-; GCN-NEXT: s_mul_i32 s4, s10, s14
+; GCN-NEXT: s_mul_i32 s5, s11, s16
+; GCN-NEXT: s_add_i32 s18, s4, s5
+; GCN-NEXT: s_sub_i32 s14, s7, s18
+; GCN-NEXT: s_mul_i32 s4, s10, s16
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s18, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_subb_u32 s17, s17, s11
-; GCN-NEXT: s_sub_u32 s19, s6, s10
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s15, s4, s5
+; GCN-NEXT: s_subb_u32 s19, s14, s11
+; GCN-NEXT: s_sub_u32 s20, s6, s10
+; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_or_b32 s14, s14, s15
+; GCN-NEXT: s_subb_u32 s14, s19, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s11
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s20, s10
+; GCN-NEXT: s_cselect_b32 s19, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s11
+; GCN-NEXT: s_cselect_b32 s14, s19, s15
+; GCN-NEXT: s_add_u32 s15, s16, 1
+; GCN-NEXT: s_addc_u32 s19, s17, 0
+; GCN-NEXT: s_add_u32 s20, s16, 2
+; GCN-NEXT: s_addc_u32 s21, s17, 0
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_cselect_b32 s14, s20, s15
+; GCN-NEXT: s_cselect_b32 s15, s21, s19
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s17, 0
+; GCN-NEXT: s_subb_u32 s4, s7, s18
; GCN-NEXT: s_cmp_ge_u32 s4, s11
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s19, s10
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s4, s11
-; GCN-NEXT: s_cselect_b32 s4, s17, s5
-; GCN-NEXT: s_add_u32 s5, s14, 1
-; GCN-NEXT: s_addc_u32 s17, s15, 0
-; GCN-NEXT: s_add_u32 s19, s14, 2
-; GCN-NEXT: s_addc_u32 s20, s15, 0
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s4, s19, s5
-; GCN-NEXT: s_cselect_b32 s5, s20, s17
-; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s16
-; GCN-NEXT: s_cmp_ge_u32 s7, s11
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s10
; GCN-NEXT: s_cselect_b32 s6, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s11
-; GCN-NEXT: s_cselect_b32 s6, s6, s16
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s5, s5, s15
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
+; GCN-NEXT: s_cmp_eq_u32 s4, s11
+; GCN-NEXT: s_cselect_b32 s4, s6, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_cselect_b32 s5, s15, s17
+; GCN-NEXT: s_cselect_b32 s4, s14, s16
; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_sub_u32 s4, s4, s6
@@ -208,7 +204,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s18, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
@@ -242,7 +237,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
-; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9]
@@ -1195,10 +1189,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s12, 0, s13
; GCN-NEXT: s_add_u32 s13, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s13
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s11, s11, s12
; GCN-NEXT: s_mul_i32 s8, s2, s11
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1229,7 +1222,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s13, s2
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s8, s11, s10
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s8, 24
@@ -1238,48 +1230,46 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s10, v1
; GCN-NEXT: v_readfirstlane_b32 s9, v0
; GCN-NEXT: s_add_u32 s8, s10, s8
-; GCN-NEXT: s_addc_u32 s10, 0, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s9
+; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_mul_i32 s8, s7, s10
+; GCN-NEXT: s_mul_i32 s8, s7, s12
; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s11, s9, s8
-; GCN-NEXT: s_sub_i32 s12, 0, s11
-; GCN-NEXT: s_mul_i32 s8, s6, s10
-; GCN-NEXT: s_sub_u32 s13, 24, s8
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s12, s12, s7
-; GCN-NEXT: s_sub_u32 s15, s13, s6
+; GCN-NEXT: s_add_i32 s13, s9, s8
+; GCN-NEXT: s_sub_i32 s10, 0, s13
+; GCN-NEXT: s_mul_i32 s8, s6, s12
+; GCN-NEXT: s_sub_u32 s14, 24, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s11, s8, s9
+; GCN-NEXT: s_subb_u32 s15, s10, s7
+; GCN-NEXT: s_sub_u32 s16, s14, s6
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s15, 0
+; GCN-NEXT: s_cmp_ge_u32 s10, s7
+; GCN-NEXT: s_cselect_b32 s11, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s16, s6
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s10, s7
+; GCN-NEXT: s_cselect_b32 s10, s15, s11
+; GCN-NEXT: s_add_u32 s11, s12, 1
+; GCN-NEXT: s_addc_u32 s15, 0, 0
+; GCN-NEXT: s_add_u32 s16, s12, 2
+; GCN-NEXT: s_addc_u32 s17, 0, 0
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_cselect_b32 s10, s16, s11
+; GCN-NEXT: s_cselect_b32 s11, s17, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_subb_u32 s8, 0, s13
; GCN-NEXT: s_cmp_ge_u32 s8, s7
; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s6
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s7
-; GCN-NEXT: s_cselect_b32 s8, s12, s9
-; GCN-NEXT: s_add_u32 s9, s10, 1
-; GCN-NEXT: s_addc_u32 s12, 0, 0
-; GCN-NEXT: s_add_u32 s15, s10, 2
-; GCN-NEXT: s_addc_u32 s16, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s8, s15, s9
-; GCN-NEXT: s_cselect_b32 s9, s16, s12
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s11, 0, s11
-; GCN-NEXT: s_cmp_ge_u32 s11, s7
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s6
+; GCN-NEXT: s_cmp_ge_u32 s14, s6
; GCN-NEXT: s_cselect_b32 s6, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s11, s7
-; GCN-NEXT: s_cselect_b32 s6, s6, s12
+; GCN-NEXT: s_cmp_eq_u32 s8, s7
+; GCN-NEXT: s_cselect_b32 s6, s6, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s7, s9, 0
-; GCN-NEXT: s_cselect_b32 s6, s8, s10
+; GCN-NEXT: s_cselect_b32 s7, s11, 0
+; GCN-NEXT: s_cselect_b32 s6, s10, s12
; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_subb_u32 s7, s7, s4
@@ -1315,7 +1305,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
@@ -1348,7 +1337,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
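The large sdiv64.ll diff (and the srem.ll one below) is the same carry-chain cleanup inside the 64-bit division expansion: wherever a carry was spilled with s_cselect_b64 (optionally folded through s_or_b32) and then re-tested with s_cmp_lg_u32/s_cmp_lg_u64 before an s_addc_u32 or s_subb_u32, the re-test is deleted and SCC flows straight from the preceding instruction; the removed compares free registers and reorder the surrounding code, which accounts for most of the churn. The recurring shape, lifted from the first hunk above:

; before
s_cselect_b64 s[0:1], -1, 0
s_or_b32 s0, s0, s1
s_cmp_lg_u32 s0, 0
s_addc_u32 s14, s14, s15
; after: s_or_b32 already set SCC to (result != 0)
s_cselect_b64 s[0:1], -1, 0
s_or_b32 s0, s0, s1
s_addc_u32 s14, s14, s15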
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index bbd1793..e12e31b 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
; GCN-NEXT: s_sub_u32 s3, 0, s8
-; GCN-NEXT: s_subb_u32 s12, 0, s9
+; GCN-NEXT: s_subb_u32 s10, 0, s9
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1522,56 +1522,52 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: v_readfirstlane_b32 s10, v0
-; GCN-NEXT: s_mul_i32 s11, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s15, s3, s10
-; GCN-NEXT: s_mul_i32 s14, s12, s10
-; GCN-NEXT: s_add_i32 s11, s15, s11
-; GCN-NEXT: s_add_i32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s16, s3, s10
-; GCN-NEXT: s_mul_i32 s15, s10, s11
-; GCN-NEXT: s_mul_hi_u32 s17, s10, s16
-; GCN-NEXT: s_mul_hi_u32 s14, s10, s11
+; GCN-NEXT: v_readfirstlane_b32 s11, v1
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s13, s3, s11
+; GCN-NEXT: s_mul_hi_u32 s15, s3, s12
+; GCN-NEXT: s_mul_i32 s14, s10, s12
+; GCN-NEXT: s_add_i32 s13, s15, s13
+; GCN-NEXT: s_add_i32 s13, s13, s14
+; GCN-NEXT: s_mul_i32 s16, s3, s12
+; GCN-NEXT: s_mul_i32 s15, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s12, s16
+; GCN-NEXT: s_mul_hi_u32 s14, s12, s13
; GCN-NEXT: s_add_u32 s15, s17, s15
; GCN-NEXT: s_addc_u32 s14, 0, s14
-; GCN-NEXT: s_mul_hi_u32 s18, s13, s16
-; GCN-NEXT: s_mul_i32 s16, s13, s16
+; GCN-NEXT: s_mul_hi_u32 s18, s11, s16
+; GCN-NEXT: s_mul_i32 s16, s11, s16
; GCN-NEXT: s_add_u32 s15, s15, s16
-; GCN-NEXT: s_mul_hi_u32 s17, s13, s11
+; GCN-NEXT: s_mul_hi_u32 s17, s11, s13
; GCN-NEXT: s_addc_u32 s14, s14, s18
; GCN-NEXT: s_addc_u32 s15, s17, 0
-; GCN-NEXT: s_mul_i32 s11, s13, s11
-; GCN-NEXT: s_add_u32 s11, s14, s11
+; GCN-NEXT: s_mul_i32 s13, s11, s13
+; GCN-NEXT: s_add_u32 s13, s14, s13
; GCN-NEXT: s_addc_u32 s14, 0, s15
-; GCN-NEXT: s_add_u32 s15, s10, s11
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GCN-NEXT: s_addc_u32 s13, s13, s14
-; GCN-NEXT: s_mul_i32 s10, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s11, s3, s15
-; GCN-NEXT: s_add_i32 s10, s11, s10
-; GCN-NEXT: s_mul_i32 s12, s12, s15
-; GCN-NEXT: s_add_i32 s10, s10, s12
-; GCN-NEXT: s_mul_i32 s3, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s12, s13, s3
-; GCN-NEXT: s_mul_i32 s14, s13, s3
-; GCN-NEXT: s_mul_i32 s17, s15, s10
-; GCN-NEXT: s_mul_hi_u32 s3, s15, s3
-; GCN-NEXT: s_mul_hi_u32 s16, s15, s10
+; GCN-NEXT: s_add_u32 s12, s12, s13
+; GCN-NEXT: s_addc_u32 s11, s11, s14
+; GCN-NEXT: s_mul_i32 s13, s3, s11
+; GCN-NEXT: s_mul_hi_u32 s14, s3, s12
+; GCN-NEXT: s_add_i32 s13, s14, s13
+; GCN-NEXT: s_mul_i32 s10, s10, s12
+; GCN-NEXT: s_add_i32 s13, s13, s10
+; GCN-NEXT: s_mul_i32 s3, s3, s12
+; GCN-NEXT: s_mul_hi_u32 s14, s11, s3
+; GCN-NEXT: s_mul_i32 s15, s11, s3
+; GCN-NEXT: s_mul_i32 s17, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s3, s12, s3
+; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
; GCN-NEXT: s_add_u32 s3, s3, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_add_u32 s3, s3, s14
-; GCN-NEXT: s_mul_hi_u32 s11, s13, s10
-; GCN-NEXT: s_addc_u32 s3, s16, s12
-; GCN-NEXT: s_addc_u32 s11, s11, 0
-; GCN-NEXT: s_mul_i32 s10, s13, s10
-; GCN-NEXT: s_add_u32 s3, s3, s10
-; GCN-NEXT: s_addc_u32 s12, 0, s11
-; GCN-NEXT: s_add_u32 s3, s15, s3
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GCN-NEXT: s_addc_u32 s14, s13, s12
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s10, s11, s13
+; GCN-NEXT: s_addc_u32 s3, s16, s14
+; GCN-NEXT: s_addc_u32 s10, s10, 0
+; GCN-NEXT: s_mul_i32 s13, s11, s13
+; GCN-NEXT: s_add_u32 s3, s3, s13
+; GCN-NEXT: s_addc_u32 s10, 0, s10
+; GCN-NEXT: s_add_u32 s3, s12, s3
+; GCN-NEXT: s_addc_u32 s14, s11, s10
; GCN-NEXT: s_ashr_i32 s10, s5, 31
; GCN-NEXT: s_add_u32 s12, s4, s10
; GCN-NEXT: s_mov_b32 s11, s10
@@ -1600,11 +1596,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_mul_i32 s3, s8, s3
; GCN-NEXT: s_sub_u32 s3, s12, s3
; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s12, s16, s9
; GCN-NEXT: s_sub_u32 s18, s3, s8
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s19, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s19, s9
; GCN-NEXT: s_cselect_b32 s20, -1, 0
@@ -1614,12 +1608,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_cselect_b32 s20, s21, s20
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s12, s12, s9
-; GCN-NEXT: s_sub_u32 s21, s18, s8
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GCN-NEXT: s_sub_u32 s16, s18, s8
; GCN-NEXT: s_subb_u32 s12, s12, 0
; GCN-NEXT: s_cmp_lg_u32 s20, 0
-; GCN-NEXT: s_cselect_b32 s16, s21, s18
+; GCN-NEXT: s_cselect_b32 s16, s16, s18
; GCN-NEXT: s_cselect_b32 s12, s12, s19
; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s5, s13, s5
@@ -1931,11 +1923,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s3, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -1945,12 +1935,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s3, s3, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s3, s3, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s5, s13, s5
@@ -2730,7 +2718,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s9, 0, s6
-; GCN-NEXT: s_subb_u32 s16, 0, s7
+; GCN-NEXT: s_subb_u32 s14, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2739,56 +2727,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s17, v1
-; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_mul_i32 s15, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s19, s9, s14
-; GCN-NEXT: s_mul_i32 s18, s16, s14
-; GCN-NEXT: s_add_i32 s15, s19, s15
-; GCN-NEXT: s_add_i32 s15, s15, s18
-; GCN-NEXT: s_mul_i32 s20, s9, s14
-; GCN-NEXT: s_mul_i32 s19, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s21, s14, s20
-; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-NEXT: s_mul_i32 s17, s9, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s9, s16
+; GCN-NEXT: s_mul_i32 s18, s14, s16
+; GCN-NEXT: s_add_i32 s17, s19, s17
+; GCN-NEXT: s_add_i32 s17, s17, s18
+; GCN-NEXT: s_mul_i32 s20, s9, s16
+; GCN-NEXT: s_mul_i32 s19, s16, s17
+; GCN-NEXT: s_mul_hi_u32 s21, s16, s20
+; GCN-NEXT: s_mul_hi_u32 s18, s16, s17
; GCN-NEXT: s_add_u32 s19, s21, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_mul_hi_u32 s22, s17, s20
-; GCN-NEXT: s_mul_i32 s20, s17, s20
+; GCN-NEXT: s_mul_hi_u32 s22, s15, s20
+; GCN-NEXT: s_mul_i32 s20, s15, s20
; GCN-NEXT: s_add_u32 s19, s19, s20
-; GCN-NEXT: s_mul_hi_u32 s21, s17, s15
+; GCN-NEXT: s_mul_hi_u32 s21, s15, s17
; GCN-NEXT: s_addc_u32 s18, s18, s22
; GCN-NEXT: s_addc_u32 s19, s21, 0
-; GCN-NEXT: s_mul_i32 s15, s17, s15
-; GCN-NEXT: s_add_u32 s15, s18, s15
+; GCN-NEXT: s_mul_i32 s17, s15, s17
+; GCN-NEXT: s_add_u32 s17, s18, s17
; GCN-NEXT: s_addc_u32 s18, 0, s19
-; GCN-NEXT: s_add_u32 s19, s14, s15
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GCN-NEXT: s_addc_u32 s17, s17, s18
-; GCN-NEXT: s_mul_i32 s14, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s15, s9, s19
-; GCN-NEXT: s_add_i32 s14, s15, s14
-; GCN-NEXT: s_mul_i32 s16, s16, s19
-; GCN-NEXT: s_add_i32 s14, s14, s16
-; GCN-NEXT: s_mul_i32 s9, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s16, s17, s9
-; GCN-NEXT: s_mul_i32 s18, s17, s9
-; GCN-NEXT: s_mul_i32 s21, s19, s14
-; GCN-NEXT: s_mul_hi_u32 s9, s19, s9
-; GCN-NEXT: s_mul_hi_u32 s20, s19, s14
+; GCN-NEXT: s_add_u32 s16, s16, s17
+; GCN-NEXT: s_addc_u32 s15, s15, s18
+; GCN-NEXT: s_mul_i32 s17, s9, s15
+; GCN-NEXT: s_mul_hi_u32 s18, s9, s16
+; GCN-NEXT: s_add_i32 s17, s18, s17
+; GCN-NEXT: s_mul_i32 s14, s14, s16
+; GCN-NEXT: s_add_i32 s17, s17, s14
+; GCN-NEXT: s_mul_i32 s9, s9, s16
+; GCN-NEXT: s_mul_hi_u32 s18, s15, s9
+; GCN-NEXT: s_mul_i32 s19, s15, s9
+; GCN-NEXT: s_mul_i32 s21, s16, s17
+; GCN-NEXT: s_mul_hi_u32 s9, s16, s9
+; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
; GCN-NEXT: s_add_u32 s9, s9, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_add_u32 s9, s9, s18
-; GCN-NEXT: s_mul_hi_u32 s15, s17, s14
-; GCN-NEXT: s_addc_u32 s9, s20, s16
-; GCN-NEXT: s_addc_u32 s15, s15, 0
-; GCN-NEXT: s_mul_i32 s14, s17, s14
-; GCN-NEXT: s_add_u32 s9, s9, s14
-; GCN-NEXT: s_addc_u32 s16, 0, s15
-; GCN-NEXT: s_add_u32 s9, s19, s9
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GCN-NEXT: s_addc_u32 s18, s17, s16
+; GCN-NEXT: s_add_u32 s9, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s14, s15, s17
+; GCN-NEXT: s_addc_u32 s9, s20, s18
+; GCN-NEXT: s_addc_u32 s14, s14, 0
+; GCN-NEXT: s_mul_i32 s17, s15, s17
+; GCN-NEXT: s_add_u32 s9, s9, s17
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: s_add_u32 s9, s16, s9
+; GCN-NEXT: s_addc_u32 s18, s15, s14
; GCN-NEXT: s_ashr_i32 s14, s11, 31
; GCN-NEXT: s_add_u32 s16, s10, s14
; GCN-NEXT: s_mov_b32 s15, s14
@@ -2817,11 +2801,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s6, s9
; GCN-NEXT: s_sub_u32 s9, s16, s9
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s16, s20, s7
; GCN-NEXT: s_sub_u32 s22, s9, s6
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s23, s16, 0
; GCN-NEXT: s_cmp_ge_u32 s23, s7
; GCN-NEXT: s_cselect_b32 s24, -1, 0
@@ -2831,12 +2813,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s24, s25, s24
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s16, s16, s7
-; GCN-NEXT: s_sub_u32 s25, s22, s6
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
+; GCN-NEXT: s_sub_u32 s20, s22, s6
; GCN-NEXT: s_subb_u32 s16, s16, 0
; GCN-NEXT: s_cmp_lg_u32 s24, 0
-; GCN-NEXT: s_cselect_b32 s20, s25, s22
+; GCN-NEXT: s_cselect_b32 s20, s20, s22
; GCN-NEXT: s_cselect_b32 s16, s16, s23
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s11, s17, s11
@@ -2887,7 +2867,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s14, 0, s11
+; GCN-NEXT: s_subb_u32 s12, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2896,56 +2876,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s15, v1
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s13, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
-; GCN-NEXT: s_mul_i32 s16, s14, s12
-; GCN-NEXT: s_add_i32 s13, s17, s13
-; GCN-NEXT: s_add_i32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s12
-; GCN-NEXT: s_mul_i32 s17, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
+; GCN-NEXT: s_mul_i32 s16, s12, s14
+; GCN-NEXT: s_add_i32 s15, s17, s15
+; GCN-NEXT: s_add_i32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s14
+; GCN-NEXT: s_mul_i32 s17, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
-; GCN-NEXT: s_mul_i32 s18, s15, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
+; GCN-NEXT: s_mul_i32 s18, s13, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s13, s15, s13
-; GCN-NEXT: s_add_u32 s13, s16, s13
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s15, s16, s15
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s17, s12, s13
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s12, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
-; GCN-NEXT: s_add_i32 s12, s13, s12
-; GCN-NEXT: s_mul_i32 s14, s14, s17
-; GCN-NEXT: s_add_i32 s12, s12, s14
-; GCN-NEXT: s_mul_i32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
-; GCN-NEXT: s_mul_i32 s16, s15, s3
-; GCN-NEXT: s_mul_i32 s19, s17, s12
-; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
+; GCN-NEXT: s_add_u32 s14, s14, s15
+; GCN-NEXT: s_addc_u32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
+; GCN-NEXT: s_add_i32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s12, s12, s14
+; GCN-NEXT: s_add_i32 s15, s15, s12
+; GCN-NEXT: s_mul_i32 s3, s3, s14
+; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
+; GCN-NEXT: s_mul_i32 s17, s13, s3
+; GCN-NEXT: s_mul_i32 s19, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s16
-; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
-; GCN-NEXT: s_addc_u32 s3, s18, s14
-; GCN-NEXT: s_addc_u32 s13, s13, 0
-; GCN-NEXT: s_mul_i32 s12, s15, s12
-; GCN-NEXT: s_add_u32 s3, s3, s12
-; GCN-NEXT: s_addc_u32 s14, 0, s13
-; GCN-NEXT: s_add_u32 s3, s17, s3
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s16, s15, s14
+; GCN-NEXT: s_add_u32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
+; GCN-NEXT: s_addc_u32 s3, s18, s16
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: s_add_u32 s3, s14, s3
+; GCN-NEXT: s_addc_u32 s16, s13, s12
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -2974,11 +2950,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -2988,12 +2962,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s23, s20, s10
-; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
+; GCN-NEXT: s_sub_u32 s18, s20, s10
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s23, s20
+; GCN-NEXT: s_cselect_b32 s18, s18, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -3463,11 +3435,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -3477,12 +3447,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
@@ -4934,7 +4902,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s17, 0, s6
-; GCN-NEXT: s_subb_u32 s24, 0, s7
+; GCN-NEXT: s_subb_u32 s22, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -4943,56 +4911,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s25, v1
-; GCN-NEXT: v_readfirstlane_b32 s22, v0
-; GCN-NEXT: s_mul_i32 s23, s17, s25
-; GCN-NEXT: s_mul_hi_u32 s27, s17, s22
-; GCN-NEXT: s_mul_i32 s26, s24, s22
-; GCN-NEXT: s_add_i32 s23, s27, s23
-; GCN-NEXT: s_add_i32 s23, s23, s26
-; GCN-NEXT: s_mul_i32 s28, s17, s22
-; GCN-NEXT: s_mul_i32 s27, s22, s23
-; GCN-NEXT: s_mul_hi_u32 s29, s22, s28
-; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
+; GCN-NEXT: v_readfirstlane_b32 s23, v1
+; GCN-NEXT: v_readfirstlane_b32 s24, v0
+; GCN-NEXT: s_mul_i32 s25, s17, s23
+; GCN-NEXT: s_mul_hi_u32 s27, s17, s24
+; GCN-NEXT: s_mul_i32 s26, s22, s24
+; GCN-NEXT: s_add_i32 s25, s27, s25
+; GCN-NEXT: s_add_i32 s25, s25, s26
+; GCN-NEXT: s_mul_i32 s28, s17, s24
+; GCN-NEXT: s_mul_i32 s27, s24, s25
+; GCN-NEXT: s_mul_hi_u32 s29, s24, s28
+; GCN-NEXT: s_mul_hi_u32 s26, s24, s25
; GCN-NEXT: s_add_u32 s27, s29, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_mul_hi_u32 s30, s25, s28
-; GCN-NEXT: s_mul_i32 s28, s25, s28
+; GCN-NEXT: s_mul_hi_u32 s30, s23, s28
+; GCN-NEXT: s_mul_i32 s28, s23, s28
; GCN-NEXT: s_add_u32 s27, s27, s28
-; GCN-NEXT: s_mul_hi_u32 s29, s25, s23
+; GCN-NEXT: s_mul_hi_u32 s29, s23, s25
; GCN-NEXT: s_addc_u32 s26, s26, s30
; GCN-NEXT: s_addc_u32 s27, s29, 0
-; GCN-NEXT: s_mul_i32 s23, s25, s23
-; GCN-NEXT: s_add_u32 s23, s26, s23
+; GCN-NEXT: s_mul_i32 s25, s23, s25
+; GCN-NEXT: s_add_u32 s25, s26, s25
; GCN-NEXT: s_addc_u32 s26, 0, s27
-; GCN-NEXT: s_add_u32 s27, s22, s23
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
-; GCN-NEXT: s_addc_u32 s25, s25, s26
-; GCN-NEXT: s_mul_i32 s22, s17, s25
-; GCN-NEXT: s_mul_hi_u32 s23, s17, s27
-; GCN-NEXT: s_add_i32 s22, s23, s22
-; GCN-NEXT: s_mul_i32 s24, s24, s27
-; GCN-NEXT: s_add_i32 s22, s22, s24
-; GCN-NEXT: s_mul_i32 s17, s17, s27
-; GCN-NEXT: s_mul_hi_u32 s24, s25, s17
-; GCN-NEXT: s_mul_i32 s26, s25, s17
-; GCN-NEXT: s_mul_i32 s29, s27, s22
-; GCN-NEXT: s_mul_hi_u32 s17, s27, s17
-; GCN-NEXT: s_mul_hi_u32 s28, s27, s22
+; GCN-NEXT: s_add_u32 s24, s24, s25
+; GCN-NEXT: s_addc_u32 s23, s23, s26
+; GCN-NEXT: s_mul_i32 s25, s17, s23
+; GCN-NEXT: s_mul_hi_u32 s26, s17, s24
+; GCN-NEXT: s_add_i32 s25, s26, s25
+; GCN-NEXT: s_mul_i32 s22, s22, s24
+; GCN-NEXT: s_add_i32 s25, s25, s22
+; GCN-NEXT: s_mul_i32 s17, s17, s24
+; GCN-NEXT: s_mul_hi_u32 s26, s23, s17
+; GCN-NEXT: s_mul_i32 s27, s23, s17
+; GCN-NEXT: s_mul_i32 s29, s24, s25
+; GCN-NEXT: s_mul_hi_u32 s17, s24, s17
+; GCN-NEXT: s_mul_hi_u32 s28, s24, s25
; GCN-NEXT: s_add_u32 s17, s17, s29
; GCN-NEXT: s_addc_u32 s28, 0, s28
-; GCN-NEXT: s_add_u32 s17, s17, s26
-; GCN-NEXT: s_mul_hi_u32 s23, s25, s22
-; GCN-NEXT: s_addc_u32 s17, s28, s24
-; GCN-NEXT: s_addc_u32 s23, s23, 0
-; GCN-NEXT: s_mul_i32 s22, s25, s22
-; GCN-NEXT: s_add_u32 s17, s17, s22
-; GCN-NEXT: s_addc_u32 s24, 0, s23
-; GCN-NEXT: s_add_u32 s17, s27, s17
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
-; GCN-NEXT: s_addc_u32 s26, s25, s24
+; GCN-NEXT: s_add_u32 s17, s17, s27
+; GCN-NEXT: s_mul_hi_u32 s22, s23, s25
+; GCN-NEXT: s_addc_u32 s17, s28, s26
+; GCN-NEXT: s_addc_u32 s22, s22, 0
+; GCN-NEXT: s_mul_i32 s25, s23, s25
+; GCN-NEXT: s_add_u32 s17, s17, s25
+; GCN-NEXT: s_addc_u32 s22, 0, s22
+; GCN-NEXT: s_add_u32 s17, s24, s17
+; GCN-NEXT: s_addc_u32 s26, s23, s22
; GCN-NEXT: s_ashr_i32 s22, s19, 31
; GCN-NEXT: s_add_u32 s24, s18, s22
; GCN-NEXT: s_mov_b32 s23, s22
@@ -5021,11 +4985,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s17, s6, s17
; GCN-NEXT: s_sub_u32 s17, s24, s17
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s24, s28, s7
; GCN-NEXT: s_sub_u32 s30, s17, s6
; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s31, s24, 0
; GCN-NEXT: s_cmp_ge_u32 s31, s7
; GCN-NEXT: s_cselect_b32 s33, -1, 0
@@ -5035,12 +4997,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s33, s34, s33
; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s24, s24, s7
-; GCN-NEXT: s_sub_u32 s34, s30, s6
-; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
+; GCN-NEXT: s_sub_u32 s28, s30, s6
; GCN-NEXT: s_subb_u32 s24, s24, 0
; GCN-NEXT: s_cmp_lg_u32 s33, 0
-; GCN-NEXT: s_cselect_b32 s28, s34, s30
+; GCN-NEXT: s_cselect_b32 s28, s28, s30
; GCN-NEXT: s_cselect_b32 s24, s24, s31
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s19, s25, s19
@@ -5091,7 +5051,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19
; GCN-NEXT: s_sub_u32 s13, 0, s18
-; GCN-NEXT: s_subb_u32 s22, 0, s19
+; GCN-NEXT: s_subb_u32 s20, 0, s19
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5100,56 +5060,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s23, v1
-; GCN-NEXT: v_readfirstlane_b32 s20, v0
-; GCN-NEXT: s_mul_i32 s21, s13, s23
-; GCN-NEXT: s_mul_hi_u32 s25, s13, s20
-; GCN-NEXT: s_mul_i32 s24, s22, s20
-; GCN-NEXT: s_add_i32 s21, s25, s21
-; GCN-NEXT: s_add_i32 s21, s21, s24
-; GCN-NEXT: s_mul_i32 s26, s13, s20
-; GCN-NEXT: s_mul_i32 s25, s20, s21
-; GCN-NEXT: s_mul_hi_u32 s27, s20, s26
-; GCN-NEXT: s_mul_hi_u32 s24, s20, s21
+; GCN-NEXT: v_readfirstlane_b32 s21, v1
+; GCN-NEXT: v_readfirstlane_b32 s22, v0
+; GCN-NEXT: s_mul_i32 s23, s13, s21
+; GCN-NEXT: s_mul_hi_u32 s25, s13, s22
+; GCN-NEXT: s_mul_i32 s24, s20, s22
+; GCN-NEXT: s_add_i32 s23, s25, s23
+; GCN-NEXT: s_add_i32 s23, s23, s24
+; GCN-NEXT: s_mul_i32 s26, s13, s22
+; GCN-NEXT: s_mul_i32 s25, s22, s23
+; GCN-NEXT: s_mul_hi_u32 s27, s22, s26
+; GCN-NEXT: s_mul_hi_u32 s24, s22, s23
; GCN-NEXT: s_add_u32 s25, s27, s25
; GCN-NEXT: s_addc_u32 s24, 0, s24
-; GCN-NEXT: s_mul_hi_u32 s28, s23, s26
-; GCN-NEXT: s_mul_i32 s26, s23, s26
+; GCN-NEXT: s_mul_hi_u32 s28, s21, s26
+; GCN-NEXT: s_mul_i32 s26, s21, s26
; GCN-NEXT: s_add_u32 s25, s25, s26
-; GCN-NEXT: s_mul_hi_u32 s27, s23, s21
+; GCN-NEXT: s_mul_hi_u32 s27, s21, s23
; GCN-NEXT: s_addc_u32 s24, s24, s28
; GCN-NEXT: s_addc_u32 s25, s27, 0
-; GCN-NEXT: s_mul_i32 s21, s23, s21
-; GCN-NEXT: s_add_u32 s21, s24, s21
+; GCN-NEXT: s_mul_i32 s23, s21, s23
+; GCN-NEXT: s_add_u32 s23, s24, s23
; GCN-NEXT: s_addc_u32 s24, 0, s25
-; GCN-NEXT: s_add_u32 s25, s20, s21
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
-; GCN-NEXT: s_addc_u32 s23, s23, s24
-; GCN-NEXT: s_mul_i32 s20, s13, s23
-; GCN-NEXT: s_mul_hi_u32 s21, s13, s25
-; GCN-NEXT: s_add_i32 s20, s21, s20
-; GCN-NEXT: s_mul_i32 s22, s22, s25
-; GCN-NEXT: s_add_i32 s20, s20, s22
-; GCN-NEXT: s_mul_i32 s13, s13, s25
-; GCN-NEXT: s_mul_hi_u32 s22, s23, s13
-; GCN-NEXT: s_mul_i32 s24, s23, s13
-; GCN-NEXT: s_mul_i32 s27, s25, s20
-; GCN-NEXT: s_mul_hi_u32 s13, s25, s13
-; GCN-NEXT: s_mul_hi_u32 s26, s25, s20
+; GCN-NEXT: s_add_u32 s22, s22, s23
+; GCN-NEXT: s_addc_u32 s21, s21, s24
+; GCN-NEXT: s_mul_i32 s23, s13, s21
+; GCN-NEXT: s_mul_hi_u32 s24, s13, s22
+; GCN-NEXT: s_add_i32 s23, s24, s23
+; GCN-NEXT: s_mul_i32 s20, s20, s22
+; GCN-NEXT: s_add_i32 s23, s23, s20
+; GCN-NEXT: s_mul_i32 s13, s13, s22
+; GCN-NEXT: s_mul_hi_u32 s24, s21, s13
+; GCN-NEXT: s_mul_i32 s25, s21, s13
+; GCN-NEXT: s_mul_i32 s27, s22, s23
+; GCN-NEXT: s_mul_hi_u32 s13, s22, s13
+; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
; GCN-NEXT: s_add_u32 s13, s13, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_add_u32 s13, s13, s24
-; GCN-NEXT: s_mul_hi_u32 s21, s23, s20
-; GCN-NEXT: s_addc_u32 s13, s26, s22
-; GCN-NEXT: s_addc_u32 s21, s21, 0
-; GCN-NEXT: s_mul_i32 s20, s23, s20
-; GCN-NEXT: s_add_u32 s13, s13, s20
-; GCN-NEXT: s_addc_u32 s22, 0, s21
-; GCN-NEXT: s_add_u32 s13, s25, s13
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
-; GCN-NEXT: s_addc_u32 s24, s23, s22
+; GCN-NEXT: s_add_u32 s13, s13, s25
+; GCN-NEXT: s_mul_hi_u32 s20, s21, s23
+; GCN-NEXT: s_addc_u32 s13, s26, s24
+; GCN-NEXT: s_addc_u32 s20, s20, 0
+; GCN-NEXT: s_mul_i32 s23, s21, s23
+; GCN-NEXT: s_add_u32 s13, s13, s23
+; GCN-NEXT: s_addc_u32 s20, 0, s20
+; GCN-NEXT: s_add_u32 s13, s22, s13
+; GCN-NEXT: s_addc_u32 s24, s21, s20
; GCN-NEXT: s_ashr_i32 s20, s15, 31
; GCN-NEXT: s_add_u32 s22, s14, s20
; GCN-NEXT: s_mov_b32 s21, s20
@@ -5178,11 +5134,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s13, s18, s13
; GCN-NEXT: s_sub_u32 s13, s22, s13
; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s22, s26, s19
; GCN-NEXT: s_sub_u32 s28, s13, s18
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s29, s22, 0
; GCN-NEXT: s_cmp_ge_u32 s29, s19
; GCN-NEXT: s_cselect_b32 s30, -1, 0
@@ -5192,12 +5146,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s30, s31, s30
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s22, s22, s19
-; GCN-NEXT: s_sub_u32 s31, s28, s18
-; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
+; GCN-NEXT: s_sub_u32 s26, s28, s18
; GCN-NEXT: s_subb_u32 s22, s22, 0
; GCN-NEXT: s_cmp_lg_u32 s30, 0
-; GCN-NEXT: s_cselect_b32 s26, s31, s28
+; GCN-NEXT: s_cselect_b32 s26, s26, s28
; GCN-NEXT: s_cselect_b32 s22, s22, s29
; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s15, s23, s15
@@ -5257,7 +5209,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15
; GCN-NEXT: s_sub_u32 s9, 0, s14
-; GCN-NEXT: s_subb_u32 s18, 0, s15
+; GCN-NEXT: s_subb_u32 s16, 0, s15
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5266,56 +5218,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s19, v1
-; GCN-NEXT: v_readfirstlane_b32 s16, v0
-; GCN-NEXT: s_mul_i32 s17, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s21, s9, s16
-; GCN-NEXT: s_mul_i32 s20, s18, s16
-; GCN-NEXT: s_add_i32 s17, s21, s17
-; GCN-NEXT: s_add_i32 s17, s17, s20
-; GCN-NEXT: s_mul_i32 s22, s9, s16
-; GCN-NEXT: s_mul_i32 s21, s16, s17
-; GCN-NEXT: s_mul_hi_u32 s23, s16, s22
-; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
+; GCN-NEXT: v_readfirstlane_b32 s17, v1
+; GCN-NEXT: v_readfirstlane_b32 s18, v0
+; GCN-NEXT: s_mul_i32 s19, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s21, s9, s18
+; GCN-NEXT: s_mul_i32 s20, s16, s18
+; GCN-NEXT: s_add_i32 s19, s21, s19
+; GCN-NEXT: s_add_i32 s19, s19, s20
+; GCN-NEXT: s_mul_i32 s22, s9, s18
+; GCN-NEXT: s_mul_i32 s21, s18, s19
+; GCN-NEXT: s_mul_hi_u32 s23, s18, s22
+; GCN-NEXT: s_mul_hi_u32 s20, s18, s19
; GCN-NEXT: s_add_u32 s21, s23, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_mul_hi_u32 s24, s19, s22
-; GCN-NEXT: s_mul_i32 s22, s19, s22
+; GCN-NEXT: s_mul_hi_u32 s24, s17, s22
+; GCN-NEXT: s_mul_i32 s22, s17, s22
; GCN-NEXT: s_add_u32 s21, s21, s22
-; GCN-NEXT: s_mul_hi_u32 s23, s19, s17
+; GCN-NEXT: s_mul_hi_u32 s23, s17, s19
; GCN-NEXT: s_addc_u32 s20, s20, s24
; GCN-NEXT: s_addc_u32 s21, s23, 0
-; GCN-NEXT: s_mul_i32 s17, s19, s17
-; GCN-NEXT: s_add_u32 s17, s20, s17
+; GCN-NEXT: s_mul_i32 s19, s17, s19
+; GCN-NEXT: s_add_u32 s19, s20, s19
; GCN-NEXT: s_addc_u32 s20, 0, s21
-; GCN-NEXT: s_add_u32 s21, s16, s17
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GCN-NEXT: s_addc_u32 s19, s19, s20
-; GCN-NEXT: s_mul_i32 s16, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s17, s9, s21
-; GCN-NEXT: s_add_i32 s16, s17, s16
-; GCN-NEXT: s_mul_i32 s18, s18, s21
-; GCN-NEXT: s_add_i32 s16, s16, s18
-; GCN-NEXT: s_mul_i32 s9, s9, s21
-; GCN-NEXT: s_mul_hi_u32 s18, s19, s9
-; GCN-NEXT: s_mul_i32 s20, s19, s9
-; GCN-NEXT: s_mul_i32 s23, s21, s16
-; GCN-NEXT: s_mul_hi_u32 s9, s21, s9
-; GCN-NEXT: s_mul_hi_u32 s22, s21, s16
+; GCN-NEXT: s_add_u32 s18, s18, s19
+; GCN-NEXT: s_addc_u32 s17, s17, s20
+; GCN-NEXT: s_mul_i32 s19, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s20, s9, s18
+; GCN-NEXT: s_add_i32 s19, s20, s19
+; GCN-NEXT: s_mul_i32 s16, s16, s18
+; GCN-NEXT: s_add_i32 s19, s19, s16
+; GCN-NEXT: s_mul_i32 s9, s9, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s17, s9
+; GCN-NEXT: s_mul_i32 s21, s17, s9
+; GCN-NEXT: s_mul_i32 s23, s18, s19
+; GCN-NEXT: s_mul_hi_u32 s9, s18, s9
+; GCN-NEXT: s_mul_hi_u32 s22, s18, s19
; GCN-NEXT: s_add_u32 s9, s9, s23
; GCN-NEXT: s_addc_u32 s22, 0, s22
-; GCN-NEXT: s_add_u32 s9, s9, s20
-; GCN-NEXT: s_mul_hi_u32 s17, s19, s16
-; GCN-NEXT: s_addc_u32 s9, s22, s18
-; GCN-NEXT: s_addc_u32 s17, s17, 0
-; GCN-NEXT: s_mul_i32 s16, s19, s16
-; GCN-NEXT: s_add_u32 s9, s9, s16
-; GCN-NEXT: s_addc_u32 s18, 0, s17
-; GCN-NEXT: s_add_u32 s9, s21, s9
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GCN-NEXT: s_addc_u32 s20, s19, s18
+; GCN-NEXT: s_add_u32 s9, s9, s21
+; GCN-NEXT: s_mul_hi_u32 s16, s17, s19
+; GCN-NEXT: s_addc_u32 s9, s22, s20
+; GCN-NEXT: s_addc_u32 s16, s16, 0
+; GCN-NEXT: s_mul_i32 s19, s17, s19
+; GCN-NEXT: s_add_u32 s9, s9, s19
+; GCN-NEXT: s_addc_u32 s16, 0, s16
+; GCN-NEXT: s_add_u32 s9, s18, s9
+; GCN-NEXT: s_addc_u32 s20, s17, s16
; GCN-NEXT: s_ashr_i32 s16, s11, 31
; GCN-NEXT: s_add_u32 s18, s10, s16
; GCN-NEXT: s_mov_b32 s17, s16
@@ -5344,11 +5292,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s14, s9
; GCN-NEXT: s_sub_u32 s9, s18, s9
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s18, s22, s15
; GCN-NEXT: s_sub_u32 s24, s9, s14
; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s25, s18, 0
; GCN-NEXT: s_cmp_ge_u32 s25, s15
; GCN-NEXT: s_cselect_b32 s26, -1, 0
@@ -5358,12 +5304,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s26, s27, s26
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s18, s18, s15
-; GCN-NEXT: s_sub_u32 s27, s24, s14
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
+; GCN-NEXT: s_sub_u32 s22, s24, s14
; GCN-NEXT: s_subb_u32 s18, s18, 0
; GCN-NEXT: s_cmp_lg_u32 s26, 0
-; GCN-NEXT: s_cselect_b32 s22, s27, s24
+; GCN-NEXT: s_cselect_b32 s22, s22, s24
; GCN-NEXT: s_cselect_b32 s18, s18, s25
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s11, s19, s11
@@ -5420,7 +5364,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s14, 0, s11
+; GCN-NEXT: s_subb_u32 s12, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5429,56 +5373,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s15, v1
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s13, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
-; GCN-NEXT: s_mul_i32 s16, s14, s12
-; GCN-NEXT: s_add_i32 s13, s17, s13
-; GCN-NEXT: s_add_i32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s12
-; GCN-NEXT: s_mul_i32 s17, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
+; GCN-NEXT: s_mul_i32 s16, s12, s14
+; GCN-NEXT: s_add_i32 s15, s17, s15
+; GCN-NEXT: s_add_i32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s14
+; GCN-NEXT: s_mul_i32 s17, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
-; GCN-NEXT: s_mul_i32 s18, s15, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
+; GCN-NEXT: s_mul_i32 s18, s13, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s13, s15, s13
-; GCN-NEXT: s_add_u32 s13, s16, s13
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s15, s16, s15
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s17, s12, s13
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s12, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
-; GCN-NEXT: s_add_i32 s12, s13, s12
-; GCN-NEXT: s_mul_i32 s14, s14, s17
-; GCN-NEXT: s_add_i32 s12, s12, s14
-; GCN-NEXT: s_mul_i32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
-; GCN-NEXT: s_mul_i32 s16, s15, s3
-; GCN-NEXT: s_mul_i32 s19, s17, s12
-; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
+; GCN-NEXT: s_add_u32 s14, s14, s15
+; GCN-NEXT: s_addc_u32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
+; GCN-NEXT: s_add_i32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s12, s12, s14
+; GCN-NEXT: s_add_i32 s15, s15, s12
+; GCN-NEXT: s_mul_i32 s3, s3, s14
+; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
+; GCN-NEXT: s_mul_i32 s17, s13, s3
+; GCN-NEXT: s_mul_i32 s19, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s16
-; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
-; GCN-NEXT: s_addc_u32 s3, s18, s14
-; GCN-NEXT: s_addc_u32 s13, s13, 0
-; GCN-NEXT: s_mul_i32 s12, s15, s12
-; GCN-NEXT: s_add_u32 s3, s3, s12
-; GCN-NEXT: s_addc_u32 s14, 0, s13
-; GCN-NEXT: s_add_u32 s3, s17, s3
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s16, s15, s14
+; GCN-NEXT: s_add_u32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
+; GCN-NEXT: s_addc_u32 s3, s18, s16
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: s_add_u32 s3, s14, s3
+; GCN-NEXT: s_addc_u32 s16, s13, s12
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -5507,11 +5447,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -5521,12 +5459,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s23, s20, s10
-; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
+; GCN-NEXT: s_sub_u32 s18, s20, s10
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s23, s20
+; GCN-NEXT: s_cselect_b32 s18, s18, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -6299,11 +6235,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v8
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -6313,12 +6247,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
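; Annotation (not part of the patch): the hunks above and below all shed
; the same redundancy: the s_cmp_lg_u32/s_cmp_lg_u64 that re-tested SCC
; between an s_cselect_b64 and the dependent s_addc_u32/s_subb_u32 is
; dropped, with the carry consumed directly. A minimal sketch of IR that
; produces such a carry chain, mirroring the s_test_srem signature in the
; next file; the body is assumed for illustration, not taken from this
; patch:

define amdgpu_kernel void @srem_sketch(ptr addrspace(1) %out, i64 %x, i64 %y) {
  ; 64-bit srem is expanded by the backend into the long scalar
  ; add/addc and sub/subb sequences checked in these tests.
  %r = srem i64 %x, %y
  store i64 %r, ptr addrspace(1) %out
  ret void
}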
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 33b0a5d..ea9bb04 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s10, s5, s10
-; GCN-NEXT: s_sub_i32 s11, s7, s10
+; GCN-NEXT: s_add_i32 s12, s5, s10
+; GCN-NEXT: s_sub_i32 s10, s7, s12
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s12, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s13, s6, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s11, s4, s5
+; GCN-NEXT: s_subb_u32 s13, s10, s9
+; GCN-NEXT: s_sub_u32 s14, s6, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s15, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s8
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, s17, s16
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s13, s13, s9
+; GCN-NEXT: s_sub_u32 s17, s14, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s14, s11, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s9
+; GCN-NEXT: s_subb_u32 s4, s7, s12
+; GCN-NEXT: s_cmp_ge_u32 s4, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s8
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s9
-; GCN-NEXT: s_cselect_b32 s15, s15, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s16, s13, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s11, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s5, s16, s13
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s10
-; GCN-NEXT: s_cmp_ge_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s8, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s8, s8, s10
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s4, s4, s7
-; GCN-NEXT: s_cselect_b32 s5, s5, s6
+; GCN-NEXT: s_cselect_b32 s7, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, s9
+; GCN-NEXT: s_cselect_b32 s5, s7, s5
+; GCN-NEXT: s_cmp_lg_u32 s5, 0
+; GCN-NEXT: s_cselect_b32 s4, s10, s4
+; GCN-NEXT: s_cselect_b32 s5, s11, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -1016,10 +1009,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s8, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1050,7 +1042,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_add_u32 s11, s14, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s10, s12, s10
; GCN-NEXT: s_ashr_i32 s8, s7, 31
; GCN-NEXT: s_add_u32 s6, s6, s8
@@ -1083,46 +1074,43 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: v_readfirstlane_b32 s12, v0
; GCN-NEXT: s_add_i32 s11, s12, s11
; GCN-NEXT: s_mul_i32 s12, s5, s10
-; GCN-NEXT: s_add_i32 s12, s11, s12
-; GCN-NEXT: s_sub_i32 s13, s7, s12
+; GCN-NEXT: s_add_i32 s14, s11, s12
+; GCN-NEXT: s_sub_i32 s12, s7, s14
; GCN-NEXT: s_mul_i32 s10, s4, s10
; GCN-NEXT: s_sub_u32 s6, s6, s10
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s14, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s13, s13, s5
-; GCN-NEXT: s_sub_u32 s15, s6, s4
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s16, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s16, s5
-; GCN-NEXT: s_cselect_b32 s11, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s4
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s16, s5
-; GCN-NEXT: s_cselect_b32 s17, s17, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s13, s13, s5
-; GCN-NEXT: s_sub_u32 s18, s15, s4
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s13, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s12, s5
+; GCN-NEXT: s_sub_u32 s16, s6, s4
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_or_b32 s17, s12, s13
+; GCN-NEXT: s_subb_u32 s17, s15, 0
+; GCN-NEXT: s_cmp_ge_u32 s17, s5
+; GCN-NEXT: s_cselect_b32 s18, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s16, s4
+; GCN-NEXT: s_cselect_b32 s19, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s17, s5
+; GCN-NEXT: s_cselect_b32 s18, s19, s18
+; GCN-NEXT: s_or_b32 s12, s12, s13
+; GCN-NEXT: s_subb_u32 s15, s15, s5
+; GCN-NEXT: s_sub_u32 s19, s16, s4
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_or_b32 s12, s12, s13
+; GCN-NEXT: s_subb_u32 s12, s15, 0
+; GCN-NEXT: s_cmp_lg_u32 s18, 0
+; GCN-NEXT: s_cselect_b32 s13, s19, s16
+; GCN-NEXT: s_cselect_b32 s12, s12, s17
; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s10, s13, 0
-; GCN-NEXT: s_cmp_lg_u32 s17, 0
-; GCN-NEXT: s_cselect_b32 s11, s18, s15
-; GCN-NEXT: s_cselect_b32 s10, s10, s16
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s12
+; GCN-NEXT: s_subb_u32 s7, s7, s14
; GCN-NEXT: s_cmp_ge_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s12
+; GCN-NEXT: s_cselect_b32 s4, s4, s10
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s5, s10, s7
-; GCN-NEXT: s_cselect_b32 s4, s11, s6
+; GCN-NEXT: s_cselect_b32 s5, s12, s7
+; GCN-NEXT: s_cselect_b32 s4, s13, s6
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
; GCN-NEXT: s_sub_u32 s4, s4, s8
; GCN-NEXT: s_subb_u32 s5, s5, s8
@@ -1170,7 +1158,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s16, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
@@ -1204,7 +1191,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s18, s18, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
-; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s19, s19, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3]
@@ -1369,10 +1355,9 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s6, s7
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s6, s2, s9
; GCN-NEXT: v_readfirstlane_b32 s7, v0
@@ -1403,7 +1388,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s11, s2
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s6, s9, s8
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s6, 24
@@ -1418,45 +1402,42 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mul_i32 s7, s5, s6
; GCN-NEXT: s_mul_i32 s6, s4, s6
; GCN-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NEXT: s_add_i32 s8, s8, s7
-; GCN-NEXT: s_sub_i32 s9, 0, s8
-; GCN-NEXT: s_sub_u32 s10, 24, s6
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s11, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s9, s9, s5
-; GCN-NEXT: s_sub_u32 s12, s10, s4
+; GCN-NEXT: s_add_i32 s10, s8, s7
+; GCN-NEXT: s_sub_i32 s8, 0, s10
+; GCN-NEXT: s_sub_u32 s11, 24, s6
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT: s_or_b32 s9, s6, s7
+; GCN-NEXT: s_subb_u32 s12, s8, s5
+; GCN-NEXT: s_sub_u32 s13, s11, s4
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s14, s8, s9
+; GCN-NEXT: s_subb_u32 s14, s12, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s5
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s4
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s5
+; GCN-NEXT: s_cselect_b32 s15, s16, s15
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s12, s12, s5
+; GCN-NEXT: s_sub_u32 s16, s13, s4
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s13, s9, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s5
+; GCN-NEXT: s_subb_u32 s6, 0, s10
+; GCN-NEXT: s_cmp_ge_u32 s6, s5
; GCN-NEXT: s_cselect_b32 s7, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s4
-; GCN-NEXT: s_cselect_b32 s14, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s13, s5
-; GCN-NEXT: s_cselect_b32 s14, s14, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s9, s9, s5
-; GCN-NEXT: s_sub_u32 s15, s12, s4
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s6, s9, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s7, s15, s12
-; GCN-NEXT: s_cselect_b32 s6, s6, s13
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s8, 0, s8
-; GCN-NEXT: s_cmp_ge_u32 s8, s5
-; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s10, s4
+; GCN-NEXT: s_cmp_ge_u32 s11, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s9
+; GCN-NEXT: s_cmp_eq_u32 s6, s5
+; GCN-NEXT: s_cselect_b32 s4, s4, s7
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s4, s6, s8
-; GCN-NEXT: s_cselect_b32 s5, s7, s10
+; GCN-NEXT: s_cselect_b32 s4, s8, s6
+; GCN-NEXT: s_cselect_b32 s5, s9, s11
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1489,7 +1470,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s8, s2, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s9, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
@@ -1522,7 +1502,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/stackguard.ll b/llvm/test/CodeGen/AMDGPU/stackguard.ll
new file mode 100644
index 0000000..393686f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stackguard.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+
+; FIXME: To actually support stackguard, the intrinsic needs to be fixed
+; to return a pointer in any address space.
+
+; CHECK: error: unable to lower stackguard
+define i1 @test_stackguard(ptr %p1) {
+ %p2 = call ptr @llvm.stackguard()
+ %res = icmp ne ptr %p2, %p1
+ ret i1 %res
+}
+
+declare ptr @llvm.stackguard()
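; Annotation (not part of the patch): the FIXME above names the blocker;
; llvm.stackguard currently returns a plain ptr in address space 0. A
; hypothetical sketch of the test once the intrinsic is overloaded on the
; result's address space, using addrspace(5), the AMDGPU private (stack)
; space; the .p5 suffix and signature are assumed for illustration and do
; not exist in LLVM today:

define i1 @test_stackguard_p5(ptr addrspace(5) %p1) {
  ; NOTE: @llvm.stackguard.p5 is hypothetical; today's verifier would
  ; reject it because only the unqualified @llvm.stackguard exists.
  %p2 = call ptr addrspace(5) @llvm.stackguard.p5()
  %res = icmp ne ptr addrspace(5) %p2, %p1
  ret i1 %res
}

declare ptr addrspace(5) @llvm.stackguard.p5()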
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bb5918b2..bdd22f25 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_addc_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -35,10 +34,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
; VI-NEXT: s_addc_u32 s3, s3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -53,14 +50,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s6, s2, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s4, s3, s7
+; GFX9-NEXT: s_add_u32 s4, s2, s6
+; GFX9-NEXT: s_addc_u32 s5, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -73,8 +68,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s2, s2, s6
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -91,14 +84,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, s4
-; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_addc_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -444,7 +435,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_add_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
-; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_addc_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -465,16 +455,14 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_addc_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -486,12 +474,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s12, s14
+; GFX9-NEXT: s_addc_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -504,10 +490,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s12, s14
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -520,10 +504,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
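; Annotation (not part of the patch): the s_uaddo_i64 checks above pair
; s_add_u32/s_addc_u32 and read the carry back through s_cselect plus
; v_cndmask. A minimal sketch of the IR shape behind them, built on the
; uadd.with.overflow intrinsic; the %carryout destination is assumed for
; illustration:

declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

define amdgpu_kernel void @uaddo_sketch(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) {
  ; %carryout is illustrative; the add result and the i1 overflow bit
  ; are what the s_addc_u32 / v_cndmask pairs above materialize.
  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %pair, 0
  %ovf = extractvalue { i64, i1 } %pair, 1
  store i64 %val, ptr addrspace(1) %out
  store i1 %ovf, ptr addrspace(1) %carryout
  ret void
}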
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 41199b0..fd461ac 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -148,7 +148,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -182,7 +181,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5]
@@ -831,10 +829,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -865,7 +862,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -874,52 +870,50 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_u32 s4, s8, s4
-; GCN-NEXT: s_addc_u32 s8, 0, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: s_addc_u32 s10, 0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mul_i32 s0, s3, s8
+; GCN-NEXT: s_mul_i32 s0, s3, s10
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s9, s1, s0
-; GCN-NEXT: s_sub_i32 s10, 0, s9
-; GCN-NEXT: s_mul_i32 s0, s2, s8
-; GCN-NEXT: s_sub_u32 s11, 24, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s12, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s13, s11, s2
+; GCN-NEXT: s_add_i32 s11, s1, s0
+; GCN-NEXT: s_sub_i32 s8, 0, s11
+; GCN-NEXT: s_mul_i32 s0, s2, s10
+; GCN-NEXT: s_sub_u32 s12, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s9, s0, s1
+; GCN-NEXT: s_subb_u32 s13, s8, s3
+; GCN-NEXT: s_sub_u32 s14, s12, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s8, s3
+; GCN-NEXT: s_cselect_b32 s9, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s2
+; GCN-NEXT: s_cselect_b32 s13, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s8, s3
+; GCN-NEXT: s_cselect_b32 s8, s13, s9
+; GCN-NEXT: s_add_u32 s9, s10, 1
+; GCN-NEXT: s_addc_u32 s13, 0, 0
+; GCN-NEXT: s_add_u32 s14, s10, 2
+; GCN-NEXT: s_addc_u32 s15, 0, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s8, s14, s9
+; GCN-NEXT: s_cselect_b32 s9, s15, s13
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s0, s10, 0
+; GCN-NEXT: s_subb_u32 s0, 0, s11
; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s2
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s12, s2
+; GCN-NEXT: s_cselect_b32 s2, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s0, s3
-; GCN-NEXT: s_cselect_b32 s0, s10, s1
-; GCN-NEXT: s_add_u32 s1, s8, 1
-; GCN-NEXT: s_addc_u32 s10, 0, 0
-; GCN-NEXT: s_add_u32 s13, s8, 2
-; GCN-NEXT: s_addc_u32 s14, 0, 0
+; GCN-NEXT: s_cselect_b32 s0, s2, s1
; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cselect_b32 s0, s13, s1
-; GCN-NEXT: s_cselect_b32 s1, s14, s10
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s9, 0, s9
-; GCN-NEXT: s_cmp_ge_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s11, s2
-; GCN-NEXT: s_cselect_b32 s2, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s2, s2, s10
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s1, s1, 0
-; GCN-NEXT: s_cselect_b32 s0, s0, s8
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_cselect_b32 s0, s9, 0
+; GCN-NEXT: s_cselect_b32 s1, s8, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
@@ -945,7 +939,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -978,7 +971,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1317,7 +1309,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1347,7 +1338,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_or_b32 s12, s12, s13
-; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index cdcc914..137dc1f 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s10, s5, s10
-; GCN-NEXT: s_sub_i32 s11, s7, s10
+; GCN-NEXT: s_add_i32 s12, s5, s10
+; GCN-NEXT: s_sub_i32 s10, s7, s12
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s12, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s13, s6, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s11, s4, s5
+; GCN-NEXT: s_subb_u32 s13, s10, s9
+; GCN-NEXT: s_sub_u32 s14, s6, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s15, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s8
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, s17, s16
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s13, s13, s9
+; GCN-NEXT: s_sub_u32 s17, s14, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s14, s11, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s9
+; GCN-NEXT: s_subb_u32 s4, s7, s12
+; GCN-NEXT: s_cmp_ge_u32 s4, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s8
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s9
-; GCN-NEXT: s_cselect_b32 s15, s15, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s16, s13, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s11, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s5, s16, s13
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s10
-; GCN-NEXT: s_cmp_ge_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s8, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s8, s8, s10
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s4, s4, s7
-; GCN-NEXT: s_cselect_b32 s5, s5, s6
+; GCN-NEXT: s_cselect_b32 s7, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, s9
+; GCN-NEXT: s_cselect_b32 s5, s7, s5
+; GCN-NEXT: s_cmp_lg_u32 s5, 0
+; GCN-NEXT: s_cselect_b32 s4, s10, s4
+; GCN-NEXT: s_cselect_b32 s5, s11, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -853,10 +846,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -887,7 +879,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -903,46 +894,43 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_mul_i32 s0, s3, s8
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s9, s1, s0
-; GCN-NEXT: s_sub_i32 s10, 0, s9
+; GCN-NEXT: s_add_i32 s10, s1, s0
+; GCN-NEXT: s_sub_i32 s9, 0, s10
; GCN-NEXT: s_mul_i32 s0, s2, s8
-; GCN-NEXT: s_sub_u32 s8, 24, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s11, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s12, s8, s2
+; GCN-NEXT: s_sub_u32 s11, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s8, s0, s1
+; GCN-NEXT: s_subb_u32 s12, s9, s3
+; GCN-NEXT: s_sub_u32 s13, s11, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s14, s8, s9
+; GCN-NEXT: s_subb_u32 s14, s12, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s3
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s2
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s3
+; GCN-NEXT: s_cselect_b32 s15, s16, s15
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s12, s12, s3
+; GCN-NEXT: s_sub_u32 s16, s13, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s13, s10, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s3
+; GCN-NEXT: s_subb_u32 s0, 0, s10
+; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s2
-; GCN-NEXT: s_cselect_b32 s14, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s13, s3
-; GCN-NEXT: s_cselect_b32 s14, s14, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s15, s12, s2
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s0, s10, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s1, s15, s12
-; GCN-NEXT: s_cselect_b32 s0, s0, s13
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s9, 0, s9
-; GCN-NEXT: s_cmp_ge_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s8, s2
+; GCN-NEXT: s_cmp_ge_u32 s11, s2
; GCN-NEXT: s_cselect_b32 s2, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s2, s2, s10
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s0, s0, s9
-; GCN-NEXT: s_cselect_b32 s1, s1, s8
+; GCN-NEXT: s_cmp_eq_u32 s0, s3
+; GCN-NEXT: s_cselect_b32 s1, s2, s1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s8, s0
+; GCN-NEXT: s_cselect_b32 s1, s9, s11
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -970,7 +958,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1003,7 +990,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1093,7 +1079,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1123,7 +1108,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_or_b32 s14, s14, s15
-; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0
; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index d67a7b1..e8db647 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_subb_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -35,10 +34,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_sub_u32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
; VI-NEXT: s_subb_u32 s3, s3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -53,14 +50,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s6, s2, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_subb_u32 s4, s3, s7
+; GFX9-NEXT: s_sub_u32 s4, s2, s6
+; GFX9-NEXT: s_subb_u32 s5, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -73,8 +68,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s2, s2, s6
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_subb_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -91,14 +84,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s2, s2, s4
-; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_subb_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -443,7 +434,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_sub_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
-; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_subb_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -464,16 +454,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_subb_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -485,12 +473,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_subb_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -503,10 +489,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s0, s12, s14
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -519,10 +503,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 75db387..28c6b40 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -774,44 +774,40 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_add_u32 s11, s12, s11
; GFX1032-NEXT: s_addc_u32 s12, 0, s13
; GFX1032-NEXT: s_add_u32 s8, s8, s11
-; GFX1032-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1032-NEXT: s_mul_i32 s11, s9, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s12
-; GFX1032-NEXT: s_mul_i32 s10, s10, s8
+; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1032-NEXT: s_mul_i32 s12, s9, s8
; GFX1032-NEXT: s_mul_i32 s9, s9, s5
-; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1032-NEXT: s_add_i32 s9, s13, s9
-; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11
+; GFX1032-NEXT: s_mul_i32 s10, s10, s8
+; GFX1032-NEXT: s_add_i32 s9, s11, s9
+; GFX1032-NEXT: s_mul_i32 s11, s5, s12
; GFX1032-NEXT: s_add_i32 s9, s9, s10
-; GFX1032-NEXT: s_mul_i32 s10, s5, s11
+; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1032-NEXT: s_mul_i32 s15, s8, s9
; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1032-NEXT: s_add_u32 s12, s12, s15
+; GFX1032-NEXT: s_add_u32 s10, s10, s15
+; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12
; GFX1032-NEXT: s_addc_u32 s14, 0, s14
-; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9
-; GFX1032-NEXT: s_add_u32 s10, s12, s10
+; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9
+; GFX1032-NEXT: s_add_u32 s10, s10, s11
; GFX1032-NEXT: s_mul_i32 s9, s5, s9
; GFX1032-NEXT: s_addc_u32 s10, s14, s13
-; GFX1032-NEXT: s_addc_u32 s11, s11, 0
+; GFX1032-NEXT: s_addc_u32 s11, s12, 0
; GFX1032-NEXT: s_add_u32 s9, s10, s9
; GFX1032-NEXT: s_addc_u32 s10, 0, s11
; GFX1032-NEXT: s_add_u32 s8, s8, s9
-; GFX1032-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1032-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s10
-; GFX1032-NEXT: s_mul_i32 s8, s3, s8
+; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1032-NEXT: s_mul_i32 s12, s2, s5
-; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5
-; GFX1032-NEXT: s_add_u32 s11, s11, s12
-; GFX1032-NEXT: s_addc_u32 s10, 0, s10
+; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5
+; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1032-NEXT: s_mul_i32 s8, s3, s8
+; GFX1032-NEXT: s_add_u32 s9, s9, s12
+; GFX1032-NEXT: s_addc_u32 s11, 0, s11
; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1032-NEXT: s_add_u32 s8, s11, s8
+; GFX1032-NEXT: s_add_u32 s8, s9, s8
; GFX1032-NEXT: s_mul_i32 s5, s3, s5
-; GFX1032-NEXT: s_addc_u32 s8, s10, s9
+; GFX1032-NEXT: s_addc_u32 s8, s11, s10
; GFX1032-NEXT: s_addc_u32 s9, s13, 0
; GFX1032-NEXT: s_add_u32 s5, s8, s5
; GFX1032-NEXT: s_addc_u32 s8, 0, s9
@@ -824,11 +820,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_sub_i32 s11, s3, s9
; GFX1032-NEXT: s_sub_u32 s10, s2, s10
; GFX1032-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1032-NEXT: s_cmp_lg_u32 s12, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, s1
; GFX1032-NEXT: s_sub_u32 s13, s10, s0
-; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1032-NEXT: s_cmp_lg_u32 s14, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, 0
; GFX1032-NEXT: s_cmp_ge_u32 s11, s1
; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
@@ -901,8 +894,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1
-; GFX1064-NEXT: s_sub_u32 s9, 0, s0
-; GFX1064-NEXT: s_subb_u32 s10, 0, s1
+; GFX1064-NEXT: s_sub_u32 s8, 0, s0
+; GFX1064-NEXT: s_subb_u32 s9, 0, s1
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX1064-NEXT: v_rcp_f32_e32 v0, v0
; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -911,109 +904,102 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s8, v1
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1064-NEXT: s_mul_i32 s5, s9, s8
-; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4
-; GFX1064-NEXT: s_mul_i32 s11, s10, s4
-; GFX1064-NEXT: s_add_i32 s5, s12, s5
-; GFX1064-NEXT: s_mul_i32 s13, s9, s4
-; GFX1064-NEXT: s_add_i32 s5, s5, s11
-; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13
-; GFX1064-NEXT: s_mul_i32 s15, s4, s5
-; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13
-; GFX1064-NEXT: s_mul_i32 s11, s8, s13
-; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s5, v0
+; GFX1064-NEXT: s_mul_i32 s10, s8, s4
+; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5
+; GFX1064-NEXT: s_mul_i32 s11, s9, s5
+; GFX1064-NEXT: s_add_i32 s10, s12, s10
+; GFX1064-NEXT: s_mul_i32 s13, s8, s5
+; GFX1064-NEXT: s_add_i32 s10, s10, s11
+; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13
+; GFX1064-NEXT: s_mul_i32 s15, s5, s10
+; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13
+; GFX1064-NEXT: s_mul_i32 s11, s4, s13
+; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10
; GFX1064-NEXT: s_add_u32 s12, s12, s15
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5
+; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10
; GFX1064-NEXT: s_add_u32 s11, s12, s11
-; GFX1064-NEXT: s_mul_i32 s5, s8, s5
+; GFX1064-NEXT: s_mul_i32 s10, s4, s10
; GFX1064-NEXT: s_addc_u32 s11, s13, s14
; GFX1064-NEXT: s_addc_u32 s12, s16, 0
-; GFX1064-NEXT: s_add_u32 s5, s11, s5
+; GFX1064-NEXT: s_add_u32 s10, s11, s10
; GFX1064-NEXT: s_addc_u32 s11, 0, s12
-; GFX1064-NEXT: s_add_u32 s12, s4, s5
-; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_mul_i32 s4, s9, s12
-; GFX1064-NEXT: s_addc_u32 s8, s8, s11
-; GFX1064-NEXT: s_mul_i32 s10, s10, s12
-; GFX1064-NEXT: s_mul_i32 s9, s9, s8
-; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX1064-NEXT: s_add_i32 s9, s13, s9
-; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4
-; GFX1064-NEXT: s_add_i32 s9, s9, s10
-; GFX1064-NEXT: s_mul_i32 s4, s8, s4
-; GFX1064-NEXT: s_mul_i32 s14, s12, s9
-; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9
-; GFX1064-NEXT: s_add_u32 s5, s5, s14
+; GFX1064-NEXT: s_add_u32 s5, s5, s10
+; GFX1064-NEXT: s_addc_u32 s4, s4, s11
+; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5
+; GFX1064-NEXT: s_mul_i32 s11, s8, s5
+; GFX1064-NEXT: s_mul_i32 s8, s8, s4
+; GFX1064-NEXT: s_mul_i32 s9, s9, s5
+; GFX1064-NEXT: s_add_i32 s8, s10, s8
+; GFX1064-NEXT: s_mul_i32 s10, s4, s11
+; GFX1064-NEXT: s_add_i32 s8, s8, s9
+; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11
+; GFX1064-NEXT: s_mul_i32 s14, s5, s8
+; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8
+; GFX1064-NEXT: s_add_u32 s9, s9, s14
+; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9
-; GFX1064-NEXT: s_add_u32 s4, s5, s4
-; GFX1064-NEXT: s_mul_i32 s9, s8, s9
-; GFX1064-NEXT: s_addc_u32 s4, s13, s11
-; GFX1064-NEXT: s_addc_u32 s5, s10, 0
-; GFX1064-NEXT: s_add_u32 s4, s4, s9
-; GFX1064-NEXT: s_addc_u32 s9, 0, s5
-; GFX1064-NEXT: s_add_u32 s10, s12, s4
-; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10
-; GFX1064-NEXT: s_addc_u32 s5, s8, s9
-; GFX1064-NEXT: s_mul_i32 s8, s3, s10
-; GFX1064-NEXT: s_mul_i32 s10, s2, s5
-; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5
-; GFX1064-NEXT: s_add_u32 s10, s11, s10
-; GFX1064-NEXT: s_addc_u32 s9, 0, s9
-; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5
-; GFX1064-NEXT: s_add_u32 s8, s10, s8
+; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8
+; GFX1064-NEXT: s_add_u32 s9, s9, s10
+; GFX1064-NEXT: s_mul_i32 s8, s4, s8
+; GFX1064-NEXT: s_addc_u32 s9, s13, s12
+; GFX1064-NEXT: s_addc_u32 s10, s11, 0
+; GFX1064-NEXT: s_add_u32 s8, s9, s8
+; GFX1064-NEXT: s_addc_u32 s9, 0, s10
+; GFX1064-NEXT: s_add_u32 s5, s5, s8
+; GFX1064-NEXT: s_addc_u32 s4, s4, s9
+; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5
+; GFX1064-NEXT: s_mul_i32 s11, s2, s4
+; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4
+; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5
; GFX1064-NEXT: s_mul_i32 s5, s3, s5
-; GFX1064-NEXT: s_addc_u32 s4, s9, s4
+; GFX1064-NEXT: s_add_u32 s8, s8, s11
+; GFX1064-NEXT: s_addc_u32 s10, 0, s10
+; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4
+; GFX1064-NEXT: s_add_u32 s5, s8, s5
+; GFX1064-NEXT: s_mul_i32 s4, s3, s4
+; GFX1064-NEXT: s_addc_u32 s5, s10, s9
; GFX1064-NEXT: s_addc_u32 s8, s12, 0
-; GFX1064-NEXT: s_add_u32 s10, s4, s5
+; GFX1064-NEXT: s_add_u32 s10, s5, s4
; GFX1064-NEXT: s_addc_u32 s11, 0, s8
; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10
; GFX1064-NEXT: s_mul_i32 s5, s0, s11
; GFX1064-NEXT: s_mul_i32 s8, s1, s10
; GFX1064-NEXT: s_add_i32 s4, s4, s5
-; GFX1064-NEXT: s_add_i32 s12, s4, s8
+; GFX1064-NEXT: s_add_i32 s8, s4, s8
; GFX1064-NEXT: s_mul_i32 s4, s0, s10
-; GFX1064-NEXT: s_sub_i32 s8, s3, s12
-; GFX1064-NEXT: s_sub_u32 s13, s2, s4
+; GFX1064-NEXT: s_sub_i32 s9, s3, s8
+; GFX1064-NEXT: s_sub_u32 s12, s2, s4
; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_subb_u32 s14, s8, s1
-; GFX1064-NEXT: s_sub_u32 s15, s13, s0
-; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX1064-NEXT: s_subb_u32 s8, s14, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s8, s1
-; GFX1064-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s15, s0
+; GFX1064-NEXT: s_subb_u32 s9, s9, s1
+; GFX1064-NEXT: s_sub_u32 s13, s12, s0
+; GFX1064-NEXT: s_subb_u32 s9, s9, 0
+; GFX1064-NEXT: s_cmp_ge_u32 s9, s1
; GFX1064-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1064-NEXT: s_cmp_eq_u32 s8, s1
-; GFX1064-NEXT: s_cselect_b32 s8, s14, s9
-; GFX1064-NEXT: s_add_u32 s9, s10, 1
+; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
+; GFX1064-NEXT: s_cselect_b32 s13, -1, 0
+; GFX1064-NEXT: s_cmp_eq_u32 s9, s1
+; GFX1064-NEXT: s_cselect_b32 s9, s13, s14
+; GFX1064-NEXT: s_add_u32 s13, s10, 1
; GFX1064-NEXT: s_addc_u32 s14, s11, 0
; GFX1064-NEXT: s_add_u32 s15, s10, 2
; GFX1064-NEXT: s_addc_u32 s16, s11, 0
-; GFX1064-NEXT: s_cmp_lg_u32 s8, 0
-; GFX1064-NEXT: s_cselect_b32 s15, s15, s9
+; GFX1064-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1064-NEXT: s_cselect_b32 s13, s15, s13
; GFX1064-NEXT: s_cselect_b32 s14, s16, s14
; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_subb_u32 s3, s3, s12
+; GFX1064-NEXT: s_subb_u32 s3, s3, s8
; GFX1064-NEXT: s_cmp_ge_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
+; GFX1064-NEXT: s_cmp_ge_u32 s12, s0
; GFX1064-NEXT: s_cselect_b32 s5, -1, 0
; GFX1064-NEXT: s_cmp_eq_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s1, s5, s4
; GFX1064-NEXT: s_cmp_lg_u32 s1, 0
; GFX1064-NEXT: s_cselect_b32 s5, s14, s11
-; GFX1064-NEXT: s_cselect_b32 s4, s15, s10
+; GFX1064-NEXT: s_cselect_b32 s4, s13, s10
; GFX1064-NEXT: s_cbranch_execnz .LBB15_3
; GFX1064-NEXT: .LBB15_2:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
index 64d055b..4445383 100644
--- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
@@ -271,7 +271,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
-; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -281,7 +280,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
-; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -299,8 +297,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
-; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
-; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -311,7 +307,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
-; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0
; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -321,7 +316,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
-; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -339,8 +333,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
-; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
-; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll b/llvm/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll
index 96639ed..bfaf799 100644
--- a/llvm/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll
+++ b/llvm/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll
@@ -45,6 +45,6 @@ declare ptr @__cxa_begin_catch(ptr)
declare void @__cxa_end_catch()
-attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/ARMLoadStoreDBG.mir b/llvm/test/CodeGen/ARM/ARMLoadStoreDBG.mir
index 812ac23..a18023c 100644
--- a/llvm/test/CodeGen/ARM/ARMLoadStoreDBG.mir
+++ b/llvm/test/CodeGen/ARM/ARMLoadStoreDBG.mir
@@ -31,8 +31,8 @@
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
- attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+ attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
index 25119fe..7df230c 100644
--- a/llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
+++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
@@ -441,8 +441,7 @@ entry:
define i32 @test_shufflevector_s32_v2s32(i32 %arg) {
; CHECK-LABEL: name: test_shufflevector_s32_v2s32
; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $r0
-; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], shufflemask(0, 0)
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ARG]](s32), [[ARG]](s32)
; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>)
%vec = insertelement <1 x i32> undef, i32 %arg, i32 0
%shuffle = shufflevector <1 x i32> %vec, <1 x i32> undef, <2 x i32> zeroinitializer
@@ -453,8 +452,7 @@ define i32 @test_shufflevector_s32_v2s32(i32 %arg) {
define i32 @test_shufflevector_s32_s32_s32(i32 %arg) {
; CHECK-LABEL: name: test_shufflevector_s32_s32_s32
; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $r0
-; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-; CHECK: [[VEC:%[0-9]+]]:_(s32) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], shufflemask(0)
+; CHECK: r0 = COPY [[ARG]](s32)
%vec = insertelement <1 x i32> undef, i32 %arg, i32 0
%shuffle = shufflevector <1 x i32> %vec, <1 x i32> undef, <1 x i32> zeroinitializer
%res = extractelement <1 x i32> %shuffle, i32 0
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 9601a2e..2731148 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -166,6 +166,7 @@
; CHECK-NEXT: ARM Execution Domain Fix
; CHECK-NEXT: BreakFalseDeps
; CHECK-NEXT: ARM pseudo instruction expansion pass
+; CHECK-NEXT: Insert KCFI indirect call checks
; CHECK-NEXT: Thumb2 instruction size reduce pass
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Machine Natural Loop Construction
diff --git a/llvm/test/CodeGen/ARM/Windows/wineh-basic.ll b/llvm/test/CodeGen/ARM/Windows/wineh-basic.ll
index d0bdd66..e4dc92d 100644
--- a/llvm/test/CodeGen/ARM/Windows/wineh-basic.ll
+++ b/llvm/test/CodeGen/ARM/Windows/wineh-basic.ll
@@ -36,8 +36,8 @@ declare arm_aapcs_vfpcc i32 @__CxxFrameHandler3(...)
declare arm_aapcs_vfpcc void @__std_terminate() local_unnamed_addr
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+strict-align,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+strict-align,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+strict-align,+vfp3" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+strict-align,+vfp3" "use-soft-float"="false" }
attributes #2 = { noreturn nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/ARM/byval_load_align.ll b/llvm/test/CodeGen/ARM/byval_load_align.ll
index c594bd3..5bb4fe7 100644
--- a/llvm/test/CodeGen/ARM/byval_load_align.ll
+++ b/llvm/test/CodeGen/ARM/byval_load_align.ll
@@ -22,6 +22,6 @@ entry:
declare void @Logger(i8 signext, ptr byval(%struct.ModuleID)) #1
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
index 972a470..cabd43e 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
@@ -27,7 +27,7 @@ entry:
!1 = !{i64 0, !"_ZTSFivE.generalized"}
!2 = !{i64 0, !"_ZTSFviE.generalized"}
-; CHECK: .section .llvm.callgraph,"o",%progbits,.text
+; CHECK: .section .llvm.callgraph,"o",%llvm_call_graph,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
index ec8d5b8..3d3974e 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
@@ -36,7 +36,7 @@ entry:
!4 = !{!5}
!5 = !{i64 0, !"_ZTSFPvS_E.generalized"}
-; CHECK: .section .llvm.callgraph,"o",%progbits,.text
+; CHECK: .section .llvm.callgraph,"o",%llvm_call_graph,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags
diff --git a/llvm/test/CodeGen/ARM/cfguard-module-flag.ll b/llvm/test/CodeGen/ARM/cfguard-module-flag.ll
index 3e8c9f4..bb3c04a 100644
--- a/llvm/test/CodeGen/ARM/cfguard-module-flag.ll
+++ b/llvm/test/CodeGen/ARM/cfguard-module-flag.ll
@@ -21,7 +21,7 @@ entry:
; CHECK-NOT: __guard_check_icall_fptr
; CHECK-NOT: __guard_dispatch_icall_fptr
}
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+armv7-a,+dsp,+fp16,+neon,+strict-align,+thumb-mode,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false"}
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+armv7-a,+dsp,+fp16,+neon,+strict-align,+thumb-mode,+vfp3" "use-soft-float"="false"}
!llvm.module.flags = !{!0}
!0 = !{i32 2, !"cfguard", i32 1}
diff --git a/llvm/test/CodeGen/ARM/clang-section.ll b/llvm/test/CodeGen/ARM/clang-section.ll
index 9277d90..9c32ab2 100644
--- a/llvm/test/CodeGen/ARM/clang-section.ll
+++ b/llvm/test/CodeGen/ARM/clang-section.ll
@@ -35,8 +35,8 @@ attributes #0 = { "bss-section"="my_bss.1" "data-section"="my_data.1" "rodata-se
attributes #1 = { "data-section"="my_data.1" "rodata-section"="my_rodata.1" }
attributes #2 = { "bss-section"="my_bss.2" "rodata-section"="my_rodata.1" }
attributes #3 = { "bss-section"="my_bss.2" "data-section"="my_data.2" "rodata-section"="my_rodata.2" }
-attributes #6 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+vfp3" "use-soft-float"="false" }
+attributes #7 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+vfp3" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1, !2, !3}
diff --git a/llvm/test/CodeGen/ARM/cmse-clear-float-bigend.mir b/llvm/test/CodeGen/ARM/cmse-clear-float-bigend.mir
index 47f4e1a..ae36da4 100644
--- a/llvm/test/CodeGen/ARM/cmse-clear-float-bigend.mir
+++ b/llvm/test/CodeGen/ARM/cmse-clear-float-bigend.mir
@@ -16,7 +16,7 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #1
- attributes #0 = { "cmse_nonsecure_entry" nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+8msecext,+armv8-m.main,-d32,-fp64,+fp-armv8,+hwdiv,+thumb-mode,-crypto,-fullfp16,-neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { "cmse_nonsecure_entry" nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+8msecext,+armv8-m.main,-d32,-fp64,+fp-armv8,+hwdiv,+thumb-mode,-crypto,-fullfp16,-neon" "use-soft-float"="false" }
attributes #1 = { nounwind }
attributes #2 = { "cmse_nonsecure_call" nounwind }
diff --git a/llvm/test/CodeGen/ARM/coalesce-dbgvalue.ll b/llvm/test/CodeGen/ARM/coalesce-dbgvalue.ll
index 4d4853c..7960b79 100644
--- a/llvm/test/CodeGen/ARM/coalesce-dbgvalue.ll
+++ b/llvm/test/CodeGen/ARM/coalesce-dbgvalue.ll
@@ -72,8 +72,8 @@ declare i32 @fn3(...) #1
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/constantpool-promote-dbg.ll b/llvm/test/CodeGen/ARM/constantpool-promote-dbg.ll
index 246eeeb..4bc6c41 100644
--- a/llvm/test/CodeGen/ARM/constantpool-promote-dbg.ll
+++ b/llvm/test/CodeGen/ARM/constantpool-promote-dbg.ll
@@ -19,7 +19,7 @@ entry:
ret ptr getelementptr inbounds ([4 x i8], ptr @.str, i32 0, i32 1), !dbg !16
}
-attributes #0 = { minsize norecurse nounwind optsize readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+hwdiv,+soft-float,-crypto,-neon" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { minsize norecurse nounwind optsize readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+hwdiv,+soft-float,-crypto,-neon" "use-soft-float"="true" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6}
diff --git a/llvm/test/CodeGen/ARM/constantpool-promote.ll b/llvm/test/CodeGen/ARM/constantpool-promote.ll
index c383b39..87f14ebf 100644
--- a/llvm/test/CodeGen/ARM/constantpool-promote.ll
+++ b/llvm/test/CodeGen/ARM/constantpool-promote.ll
@@ -200,8 +200,8 @@ declare void @d(ptr) #1
declare void @llvm.memcpy.p0.p0.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1)
declare void @llvm.memmove.p0.p0.i32(ptr, ptr, i32, i1) local_unnamed_addr
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/ARM/early-cfi-sections.ll b/llvm/test/CodeGen/ARM/early-cfi-sections.ll
index 72b8702..ef99ae5 100644
--- a/llvm/test/CodeGen/ARM/early-cfi-sections.ll
+++ b/llvm/test/CodeGen/ARM/early-cfi-sections.ll
@@ -13,7 +13,7 @@ entry:
ret void, !dbg !10
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+soft-float,+strict-align,-crypto,-neon" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+soft-float,+strict-align,-crypto,-neon" "use-soft-float"="true" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6}
diff --git a/llvm/test/CodeGen/ARM/fp16-vld.ll b/llvm/test/CodeGen/ARM/fp16-vld.ll
index 549546e..778685c 100644
--- a/llvm/test/CodeGen/ARM/fp16-vld.ll
+++ b/llvm/test/CodeGen/ARM/fp16-vld.ll
@@ -43,4 +43,4 @@ byeblock:
ret void
}
-attributes #0 = { norecurse nounwind readonly "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "target-cpu"="generic" "target-features"="+armv8.2-a,+fullfp16,+strict-align,-thumb-mode" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind readonly "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "target-cpu"="generic" "target-features"="+armv8.2-a,+fullfp16,+strict-align,-thumb-mode" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/ARM/global-merge-1.ll b/llvm/test/CodeGen/ARM/global-merge-1.ll
index 46e9d96..05719ae 100644
--- a/llvm/test/CodeGen/ARM/global-merge-1.ll
+++ b/llvm/test/CodeGen/ARM/global-merge-1.ll
@@ -74,9 +74,9 @@ define internal ptr @returnFoo() #2 {
ret ptr @foo
}
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #3 = { nounwind }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/ARM/isel-v8i32-crash.ll b/llvm/test/CodeGen/ARM/isel-v8i32-crash.ll
index 27534a6..bdd842a 100644
--- a/llvm/test/CodeGen/ARM/isel-v8i32-crash.ll
+++ b/llvm/test/CodeGen/ARM/isel-v8i32-crash.ll
@@ -21,4 +21,4 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/ARM/kcfi-arm.ll b/llvm/test/CodeGen/ARM/kcfi-arm.ll
new file mode 100644
index 0000000..e3696cf
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-arm.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck %s --check-prefixes=MIR,ISEL
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs -stop-after=kcfi < %s | FileCheck %s --check-prefixes=MIR,KCFI
+
+; MIR checks for all functions (grouped here to prevent update_llc_test_checks.py from removing them)
+
+; MIR-LABEL: name: f1
+; MIR: body:
+
+; ISEL: BLX %0, csr_aapcs,{{.*}} cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_ARM $r0, 12345678
+; KCFI-NEXT: BLX killed $r0, csr_aapcs,{{.*}}
+; KCFI-NEXT: }
+
+; MIR-LABEL: name: f2
+; MIR: body:
+
+; ISEL: TCRETURNri %0, 0, csr_aapcs, implicit $sp, cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_ARM $r0, 12345678
+; KCFI-NEXT: TAILJMPr killed $r0, csr_aapcs, implicit $sp, implicit $sp
+; KCFI-NEXT: }
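+
+; The BUNDLE keeps KCFI_CHECK_ARM glued to the call so later passes cannot
+; reorder or insert code between the type check and the indirect branch it
+; guards.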
+
+; ASM: .long 12345678
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f1:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r11, lr}
+; ASM-NEXT: push {r11, lr}
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq .Ltmp0
+; ASM-NEXT: udf #33760
+; ASM-NEXT: .Ltmp0:
+; ASM-NEXT: blx r0
+; ASM-NEXT: pop {r11, pc}
+
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
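+
+; A note on the check sequence above, reading the constants off the CHECK
+; lines: an ARM modified immediate encodes only an 8-bit value with an even
+; rotation, so the 32-bit type hash is xor-ed in one byte at a time:
+;   0x00BC614E (12345678) = 0x0000004E + 0x00006100 + 0x00BC0000 + 0x00000000
+; i.e. eor #78, eor #24832, eor #12320768, with the final eors #0 consuming
+; the zero top byte while also setting the flags for the beq.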
+
+; Test with tail call
+define void @f2(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f2:
+; ASM: @ %bb.0:
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq .Ltmp1
+; ASM-NEXT: udf #33760
+; ASM-NEXT: .Ltmp1:
+; ASM-NEXT: bx r0
+
+ tail call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test r3 spill/reload when target is r12 and r3 is a call argument.
+; With 5+ arguments (target + 4 args), r0-r3 are all used for arguments,
+; forcing r3 to be spilled when we need it as a scratch register.
+define void @f3_r3_spill(ptr noundef %target, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; ASM-LABEL: f3_r3_spill:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r11, lr}
+; ASM-NEXT: push {r11, lr}
+; ASM-NEXT: mov lr, r3
+; ASM-NEXT: ldr r3, [sp, #8]
+; ASM-NEXT: mov r12, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: mov r2, lr
+; ASM-NEXT: stmdb sp!, {r3}
+; ASM-NEXT: bic r3, r12, #1
+; ASM-NEXT: ldr r3, [r3, #-4]
+; ASM-NEXT: eor r3, r3, #78
+; ASM-NEXT: eor r3, r3, #24832
+; ASM-NEXT: eor r3, r3, #12320768
+; ASM-NEXT: eors r3, r3, #0
+; ASM-NEXT: ldm sp!, {r3}
+; ASM-NEXT: beq .Ltmp2
+; ASM-NEXT: udf #33772
+; ASM-NEXT: .Ltmp2:
+; ASM-NEXT: blx r12
+; ASM-NEXT: pop {r11, pc}
+; Arguments: r0=%target, r1=%a, r2=%b, r3=%c, [sp]=%d
+; Call needs: r0=%a, r1=%b, r2=%c, r3=%d, target in r12
+; The compiler shuffles arguments into place, saving r3 (c) in lr and loading d from the stack.
+; r3 is live as the 4th argument, so it is pushed before the KCFI check
+; and restored immediately after the comparison, before the branch.
+ call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test with 3 arguments - r3 not live as an argument; the target lands in r3, so r12 is used as scratch without spilling
+define void @f4_r3_unused(ptr noundef %target, i32 %a, i32 %b) !kcfi_type !1 {
+; ASM-LABEL: f4_r3_unused:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r11, lr}
+; ASM-NEXT: push {r11, lr}
+; ASM-NEXT: mov r3, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: bic r12, r3, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq .Ltmp3
+; ASM-NEXT: udf #33763
+; ASM-NEXT: .Ltmp3:
+; ASM-NEXT: blx r3
+; ASM-NEXT: pop {r11, pc}
+; Only 3 arguments total, so r3 is not used as a call argument.
+; Compiler puts target→r3, a→r0, b→r1;
+; r3 is the target, so we use r12 as scratch (no spill needed).
+ call void %target(i32 %a, i32 %b) [ "kcfi"(i32 12345678) ]
+ ret void
+}
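+
+; The trap immediates differ per call site: #33760 = 0x83E0, #33772 = 0x83EC,
+; #33763 = 0x83E3, so the low bits of the udf immediate appear to encode the
+; register holding the call target (r0, r12, r3), letting a trap handler
+; recover which target failed the check.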
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ISEL: {{.*}}
+; KCFI: {{.*}}
+; MIR: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll b/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll
new file mode 100644
index 0000000..8e71cae
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -filetype=obj < %s
+; RUN: llc -mtriple=thumbv7-linux-gnueabi < %s | FileCheck %s
+
+; This test verifies that KCFI instrumentation doesn't cause "out of range
+; pc-relative fixup value" errors when generating object files.
+;
+; The test creates a scenario with enough KCFI-instrumented indirect calls
+; (~32 bytes each) that would push a cbz/cbnz instruction out of its 126-byte
+; forward range if the KCFI_CHECK pseudo-instruction size is not properly
+; accounted for.
+;
+; Without the fix (KCFI_CHECK returns size 0):
+; - Backend thinks KCFI checks take no space
+; - Generates cbz to branch over the code
+; - During assembly, cbz target is >126 bytes away
+; - Assembly fails with "error: out of range pc-relative fixup value"
+;
+; With the fix (KCFI_CHECK returns size 32 for Thumb2):
+; - Backend correctly accounts for KCFI check expansion
+; - Avoids cbz or uses longer-range branch instructions
+; - Assembly succeeds, object file is generated
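+;
+; As a rough size check using the numbers above: six instrumented calls at
+; ~32 bytes per KCFI check add about 192 bytes between a cbz and its target,
+; well beyond the 126-byte forward reach of Thumb cbz/cbnz.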
+
+declare void @external_function(i32)
+
+; Test WITHOUT KCFI: should generate cbz since calls are small
+; CHECK-LABEL: test_without_kcfi:
+; CHECK: cbz
+; CHECK-NOT: bic{{.*}}#1
+define i32 @test_without_kcfi(ptr %callback, i32 %x) {
+entry:
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %if_zero, label %if_nonzero
+
+if_nonzero:
+ ; Regular (non-KCFI) indirect calls - much smaller
+ call void %callback()
+ call void %callback()
+ call void %callback()
+ call void %callback()
+ call void %callback()
+ call void %callback()
+
+ call void @external_function(i32 %x)
+ %add1 = add i32 %x, 1
+ ret i32 %add1
+
+if_zero:
+ call void @external_function(i32 0)
+ ret i32 0
+}
+
+; Test WITH KCFI: should NOT generate cbz due to large KCFI checks
+; CHECK-LABEL: test_with_kcfi:
+; CHECK-NOT: cbz
+; CHECK: bic{{.*}}#1
+define i32 @test_with_kcfi(ptr %callback, i32 %x) !kcfi_type !1 {
+entry:
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %if_zero, label %if_nonzero
+
+if_nonzero:
+ ; Six KCFI-instrumented indirect calls (~192 bytes total, exceeds cbz range)
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+
+ ; Regular call to prevent optimization
+ call void @external_function(i32 %x)
+ %add1 = add i32 %x, 1
+ ret i32 %add1
+
+if_zero:
+ call void @external_function(i32 0)
+ ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
diff --git a/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll b/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll
new file mode 100644
index 0000000..f8e0838
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK: .p2align 2
+; CHECK-NOT: nop
+; CHECK: .long 12345678
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; CHECK-LABEL: f1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-4]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp0
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; CHECK: .p2align 2
+; CHECK-NOT: .long
+; CHECK-NOT: nop
+define void @f2(ptr noundef %x) {
+; CHECK-LABEL: f2:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-4]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp1
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; CHECK: .p2align 2
+; CHECK: .long 12345678
+; CHECK-COUNT-11: nop
+define void @f3(ptr noundef %x) #0 !kcfi_type !1 {
+; CHECK-LABEL: f3:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-48]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp3
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; CHECK: .p2align 2
+; CHECK-COUNT-11: nop
+define void @f4(ptr noundef %x) #0 {
+; CHECK-LABEL: f4:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-48]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp5
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp5:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+attributes #0 = { "patchable-function-prefix"="11" }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
diff --git a/llvm/test/CodeGen/ARM/kcfi-thumb.ll b/llvm/test/CodeGen/ARM/kcfi-thumb.ll
new file mode 100644
index 0000000..7c02d830
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-thumb.ll
@@ -0,0 +1,215 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=thumbv6m-none-eabi < %s | FileCheck %s
+
+; This test verifies that Thumb1 (ARMv6-M) generates correct code for backend KCFI.
+; Thumb1 uses the backend KCFI implementation with Thumb1-specific instructions.
+
+; Test function without KCFI annotation
+; CHECK-LABEL: .globl nosan
+; CHECK-NEXT: .p2align 1
+; CHECK-NEXT: .type nosan,%function
+; CHECK-NEXT: .code 16
+; CHECK-NEXT: .thumb_func
+define dso_local void @nosan() nounwind {
+; CHECK-LABEL: nosan:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: bx lr
+ ret void
+}
+
+; Test function with KCFI annotation - verifies type hash emission
+;; The alignment is at least 4 to avoid unaligned type hash loads when this
+;; instrumented function is indirectly called.
+; CHECK-LABEL: .globl target_func
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .type target_func,%function
+; CHECK-NEXT: .long 3170468932
+; CHECK-NEXT: .code 16
+; CHECK-NEXT: .thumb_func
+define void @target_func() !kcfi_type !1 {
+; CHECK-LABEL: target_func:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: bx lr
+ ret void
+}
+
+; Test indirect call with KCFI check using operand bundles
+; CHECK-LABEL: .globl f1
+; CHECK: .p2align 2
+; CHECK-NEXT: .type f1,%function
+; CHECK-NEXT: .long 3170468932
+; CHECK-NEXT: .code 16
+; CHECK-NEXT: .thumb_func
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; CHECK-LABEL: f1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: beq .Ltmp0
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r7, pc}
+ call void %x() [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
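+
+; A note on the constant materialization above, taking the arithmetic from
+; the CHECK lines: Thumb1 has no flexible immediates, so the expected hash is
+; built a byte at a time:
+;   ((188 * 256 + 249) * 256 + 132) * 256 + 68 = 3170468932 = 0xBCF98444
+; which is the unsigned value of the i32 hash -1124498364.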
+
+; Test with tail call - backend KCFI supports tail calls
+define void @f2(ptr noundef %x) !kcfi_type !1 {
+; CHECK-LABEL: f2:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: beq .Ltmp1
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r7, pc}
+ tail call void %x() [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+; Test with R2 live (3 arguments) - compiler shuffles args and pushes r2 around the check
+define void @f3_r2_live(ptr noundef %x, i32 %a, i32 %b, i32 %c) !kcfi_type !1 {
+; CHECK-LABEL: f3_r2_live:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: mov r2, r3
+; CHECK-NEXT: push {r2}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r4
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: pop {r2}
+; CHECK-NEXT: beq .Ltmp2
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp2:
+; CHECK-NEXT: blx r4
+; CHECK-NEXT: pop {r4, pc}
+; Compiler shuffles: target→r4, c→r2, a→r0, b→r1
+; R2 is live (3rd arg), so we push it, then use R3 as a temp and R2 as scratch
+ call void %x(i32 %a, i32 %b, i32 %c) [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+; Test with both R2 and R3 live (4 arguments) - compiler moves them to r5/r4, uses R3 as temp and R2 as scratch
+define void @f4_r2_r3_live(ptr noundef %x, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; CHECK-LABEL: f4_r2_r3_live:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: ldr r3, [sp, #16]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: mov r2, r5
+; CHECK-NEXT: push {r3}
+; CHECK-NEXT: push {r2}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r4
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: pop {r2}
+; CHECK-NEXT: pop {r3}
+; CHECK-NEXT: beq .Ltmp3
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: blx r4
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+; Compiler shuffles: r3→r5, target→r4, d→r3 (from stack), a→r0, b→r1, c→r2
+; Then pushes r3 (d value), then r2, uses R3 as temp, R2 as scratch
+ call void %x(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+; Test where target ends up in R12, forcing R2 as scratch, with both R2 and R3 live
+; This uses inline asm to force target into R12, with 4 call arguments to make R2/R3 live
+define void @f5_r12_target_r2_r3_live(i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; CHECK-LABEL: f5_r12_target_r2_r3_live:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: @APP
+; CHECK-NEXT: @NO_APP
+; CHECK-NEXT: push {r3}
+; CHECK-NEXT: push {r2}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r12
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: pop {r2}
+; CHECK-NEXT: pop {r3}
+; CHECK-NEXT: beq .Ltmp4
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp4:
+; CHECK-NEXT: blx r12
+; CHECK-NEXT: pop {r7, pc}
+; Use inline asm to get function pointer into R12
+; With 4 arguments (r0-r3), both R2 and R3 are live
+; Target in R12 means R2 is scratch, R3 is temp, and both need spilling
+ %target = call ptr asm "", "={r12}"()
+ call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 -1124498364}
diff --git a/llvm/test/CodeGen/ARM/kcfi-thumb2.ll b/llvm/test/CodeGen/ARM/kcfi-thumb2.ll
new file mode 100644
index 0000000..f319d98
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-thumb2.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck %s --check-prefixes=MIR,ISEL
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs -stop-after=kcfi < %s | FileCheck %s --check-prefixes=MIR,KCFI
+
+; MIR checks for all functions (grouped here to prevent update_llc_test_checks.py from removing them)
+
+; MIR-LABEL: name: f1
+; MIR: body:
+
+; ISEL: tBLXr 14 /* CC::al */, $noreg, %0, csr_aapcs,{{.*}} cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_Thumb2 $r0, 12345678
+; KCFI-NEXT: tBLXr 14 /* CC::al */, $noreg, {{(killed )?}}$r0, csr_aapcs,{{.*}}
+; KCFI-NEXT: }
+
+; MIR-LABEL: name: f2
+; MIR: body:
+
+; ISEL: TCRETURNri %0, 0, csr_aapcs, implicit $sp, cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_Thumb2 $r0, 12345678
+; KCFI-NEXT: tTAILJMPr {{(killed )?}}$r0, csr_aapcs, implicit $sp, implicit $sp
+; KCFI-NEXT: }
+
+; Test function without KCFI annotation
+; ASM-LABEL: .globl nosan
+; ASM-NEXT: .p2align 1
+; ASM-NEXT: .type nosan,%function
+; ASM-NEXT: .code 16
+; ASM-NEXT: .thumb_func
+define dso_local void @nosan() nounwind {
+; ASM-LABEL: nosan:
+; ASM: @ %bb.0:
+; ASM-NEXT: bx lr
+ ret void
+}
+
+; Test function with KCFI annotation - verifies type hash emission
+;; The alignment is at least 4 to avoid unaligned type hash loads when this
+;; instrumented function is indirectly called.
+; ASM-LABEL: .globl target_func
+; ASM-NEXT: .p2align 2
+; ASM-NEXT: .type target_func,%function
+; ASM-NEXT: .long 12345678
+; ASM-NEXT: .code 16
+; ASM-NEXT: .thumb_func
+define void @target_func() !kcfi_type !1 {
+; ASM-LABEL: target_func:
+; ASM: @ %bb.0:
+; ASM-NEXT: bx lr
+ ret void
+}
+
+; Test indirect call with KCFI check
+; ASM: .long 12345678
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f1:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r7, lr}
+; ASM-NEXT: push {r7, lr}
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq.w .Ltmp0
+; ASM-NEXT: udf #128
+; ASM-NEXT: .Ltmp0:
+; ASM-NEXT: blx r0
+; ASM-NEXT: pop {r7, pc}
+
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test with tail call
+define void @f2(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f2:
+; ASM: @ %bb.0:
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq.w .Ltmp1
+; ASM-NEXT: udf #128
+; ASM-NEXT: .Ltmp1:
+; ASM-NEXT: bx r0
+
+ tail call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test r3 spill/reload when target is r12 and r3 is a call argument (Thumb2)
+define void @f3_r3_spill(ptr noundef %target, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; ASM-LABEL: f3_r3_spill:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r7, lr}
+; ASM-NEXT: push {r7, lr}
+; ASM-NEXT: mov lr, r3
+; ASM-NEXT: ldr r3, [sp, #8]
+; ASM-NEXT: mov r12, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: mov r2, lr
+; ASM-NEXT: push {r3}
+; ASM-NEXT: bic r3, r12, #1
+; ASM-NEXT: ldr r3, [r3, #-4]
+; ASM-NEXT: eor r3, r3, #78
+; ASM-NEXT: eor r3, r3, #24832
+; ASM-NEXT: eor r3, r3, #12320768
+; ASM-NEXT: eors r3, r3, #0
+; ASM-NEXT: pop {r3}
+; ASM-NEXT: beq.w .Ltmp2
+; ASM-NEXT: udf #140
+; ASM-NEXT: .Ltmp2:
+; ASM-NEXT: blx r12
+; ASM-NEXT: pop {r7, pc}
+; Arguments: r0=%target, r1=%a, r2=%b, r3=%c, [sp+8]=%d
+; Call needs: r0=%a, r1=%b, r2=%c, r3=%d, target in r12
+; r3 is live as the 4th argument, so it is pushed before the KCFI check
+ call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test with 3 arguments - r3 not live as an argument; the target lands in r3, so r12 is used as scratch
+define void @f4_r3_unused(ptr noundef %target, i32 %a, i32 %b) !kcfi_type !1 {
+; ASM-LABEL: f4_r3_unused:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r7, lr}
+; ASM-NEXT: push {r7, lr}
+; ASM-NEXT: mov r3, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: bic r12, r3, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq.w .Ltmp3
+; ASM-NEXT: udf #131
+; ASM-NEXT: .Ltmp3:
+; ASM-NEXT: blx r3
+; ASM-NEXT: pop {r7, pc}
+; Only 3 arguments total, so r3 is not used as a call argument.
+; The target is in r3, so r12 is used as scratch (no spill needed).
+ call void %target(i32 %a, i32 %b) [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ISEL: {{.*}}
+; KCFI: {{.*}}
+; MIR: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/kcfi.ll b/llvm/test/CodeGen/ARM/kcfi.ll
deleted file mode 100644
index 9e16468..0000000
--- a/llvm/test/CodeGen/ARM/kcfi.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc -mtriple=thumbv6m-none-eabi < %s | FileCheck %s
-
-; CHECK-LABEL: .globl nosan
-; CHECK-NEXT: .p2align 1
-; CHECK-NEXT: .type nosan,%function
-; CHECK-NEXT: .code 16
-; CHECK-NEXT: .thumb_func
-; CHECK-NEXT: nosan:
-define dso_local void @nosan() nounwind {
- ret void
-}
-
-;; The alignment is at least 4 to avoid unaligned type hash loads when this
-;; instrumented function is indirectly called.
-; CHECK-LABEL: .globl f1
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: .type f1,%function
-; CHECK-NEXT: .long 3170468932
-; CHECK-NEXT: .code 16
-; CHECK-NEXT: .thumb_func
-; CHECK-NEXT: f1:
-define void @f1(ptr noundef %x) !kcfi_type !1 {
- ret void
-}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 4, !"kcfi", i32 1}
-!1 = !{i32 -1124498364}
diff --git a/llvm/test/CodeGen/ARM/out-of-registers.ll b/llvm/test/CodeGen/ARM/out-of-registers.ll
index c6488f1..8da2069 100644
--- a/llvm/test/CodeGen/ARM/out-of-registers.ll
+++ b/llvm/test/CodeGen/ARM/out-of-registers.ll
@@ -32,7 +32,7 @@ declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vl
; Function Attrs: nounwind
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/ARM/relax-per-target-feature.ll b/llvm/test/CodeGen/ARM/relax-per-target-feature.ll
index 71db294..99ed6f3 100644
--- a/llvm/test/CodeGen/ARM/relax-per-target-feature.ll
+++ b/llvm/test/CodeGen/ARM/relax-per-target-feature.ll
@@ -30,5 +30,5 @@ entry:
attributes #0 = { nounwind "disable-tail-calls"="false" "target-cpu"="cortex-a53" "target-features"="+crypto,+fp-armv8,+neon,+soft-float-abi,+strict-align,+thumb-mode,-crc,-dotprod,-dsp,-hwdiv,-hwdiv-arm,-ras" "use-soft-float"="true" }
-attributes #2 = { nounwind "disable-tail-calls"="false" "target-cpu"="arm7tdmi" "target-features"="+strict-align,+thumb-mode,-crc,-dotprod,-dsp,-hwdiv,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #2 = { nounwind "disable-tail-calls"="false" "target-cpu"="arm7tdmi" "target-features"="+strict-align,+thumb-mode,-crc,-dotprod,-dsp,-hwdiv,-hwdiv-arm,-ras" "use-soft-float"="true" }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll b/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll
index 76df93b..2aa7611 100644
--- a/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll
+++ b/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll
@@ -32,4 +32,4 @@ land.end: ; preds = %land.rhs, %entry
ret void
}
-attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m4" "target-features"="+armv7e-m,+dsp,+fp16,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp,-aes,-crc,-crypto,-dotprod,-fp16fml,-fullfp16,-hwdiv-arm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m4" "target-features"="+armv7e-m,+dsp,+fp16,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp,-aes,-crc,-crypto,-dotprod,-fp16fml,-fullfp16,-hwdiv-arm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll b/llvm/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
index 6f2cb42..2cf6d29 100644
--- a/llvm/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
+++ b/llvm/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
@@ -25,7 +25,7 @@ declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture readonly, i32,
; Function Attrs: nounwind optsize
declare i32 @printf(ptr nocapture readonly, ...) #2
-attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
-attributes #2 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #3 = { nounwind optsize }
diff --git a/llvm/test/CodeGen/ARM/stack_guard_remat.ll b/llvm/test/CodeGen/ARM/stack_guard_remat.ll
index 983ef13..0930ccc 100644
--- a/llvm/test/CodeGen/ARM/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/ARM/stack_guard_remat.ll
@@ -68,7 +68,7 @@ declare void @foo3(ptr)
; Function Attrs: nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
;--- pic-flag.ll
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/ARM/struct-byval-frame-index.ll b/llvm/test/CodeGen/ARM/struct-byval-frame-index.ll
index 24df0d3..868dc03 100644
--- a/llvm/test/CodeGen/ARM/struct-byval-frame-index.ll
+++ b/llvm/test/CodeGen/ARM/struct-byval-frame-index.ll
@@ -34,4 +34,4 @@ entry:
; Function Attrs: nounwind
declare void @RestoreMVBlock8x8(i32, i32, ptr byval(%structN) nocapture, i32) #1
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/ARM/subtarget-align.ll b/llvm/test/CodeGen/ARM/subtarget-align.ll
index a24b487..f87e21f 100644
--- a/llvm/test/CodeGen/ARM/subtarget-align.ll
+++ b/llvm/test/CodeGen/ARM/subtarget-align.ll
@@ -18,7 +18,7 @@ entry:
ret i32 0
}
-attributes #0 = { "target-cpu"="generic" "target-features"="+armv7-a,+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "target-cpu"="generic" "target-features"="+armv7-a,+dsp,+neon,+vfp3,-thumb-mode" "use-soft-float"="false" }
attributes #1 = { "target-cpu"="arm7tdmi" "target-features"="+armv4t" "use-soft-float"="true" }
diff --git a/llvm/test/CodeGen/ARM/unschedule-first-call.ll b/llvm/test/CodeGen/ARM/unschedule-first-call.ll
index e0bb787..ad422f7 100644
--- a/llvm/test/CodeGen/ARM/unschedule-first-call.ll
+++ b/llvm/test/CodeGen/ARM/unschedule-first-call.ll
@@ -128,7 +128,7 @@ declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) #1
; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) #1
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="arm1176jzf-s" "target-features"="+dsp,+strict-align,+vfp2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="arm1176jzf-s" "target-features"="+dsp,+strict-align,+vfp2" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/ARM/vector-spilling.ll b/llvm/test/CodeGen/ARM/vector-spilling.ll
index 5dc20a8..8d1339844 100644
--- a/llvm/test/CodeGen/ARM/vector-spilling.ll
+++ b/llvm/test/CodeGen/ARM/vector-spilling.ll
@@ -30,4 +30,4 @@ entry:
declare void @foo(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)
-attributes #0 = { noredzone "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noredzone "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/ARM/vldm-sched-a9.ll b/llvm/test/CodeGen/ARM/vldm-sched-a9.ll
index 892b261..4e36711 100644
--- a/llvm/test/CodeGen/ARM/vldm-sched-a9.ll
+++ b/llvm/test/CodeGen/ARM/vldm-sched-a9.ll
@@ -132,4 +132,4 @@ entry:
declare void @capture(ptr, ptr)
-attributes #0 = { noredzone "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noredzone "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/AVR/dynalloca.ll b/llvm/test/CodeGen/AVR/dynalloca.ll
index 3face71..b32910b 100644
--- a/llvm/test/CodeGen/AVR/dynalloca.ll
+++ b/llvm/test/CodeGen/AVR/dynalloca.ll
@@ -64,16 +64,19 @@ define void @dynalloca2(i16 %x) {
; CHECK-NEXT: out 63, r0
; CHECK-NEXT: out 61, {{.*}}
; Store values on the stack
-; CHECK: ldi r20, 0
-; CHECK: ldi r21, 0
-; CHECK: std Z+8, r21
-; CHECK: std Z+7, r20
-; CHECK: std Z+6, r21
-; CHECK: std Z+5, r20
-; CHECK: std Z+4, r21
-; CHECK: std Z+3, r20
-; CHECK: std Z+2, r21
-; CHECK: std Z+1, r20
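+; The patterns below use FileCheck variables: [[REG1:r[0-9]+]] binds REG1 to
+; whatever register the compiler actually picked, and each later [[REG1]] use
+; must match that same register, so the test no longer hardcodes the allocation.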
+; CHECK: ldi [[REG1:r[0-9]+]], 0
+; CHECK: ldi [[REG2:r[0-9]+]], 0
+; CHECK: std Z+8, [[REG2]]
+; CHECK: std Z+7, [[REG1]]
+; CHECK: std Z+6, [[REG2]]
+; CHECK: std Z+5, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
+; CHECK: std Z+1, [[REG1]]
; CHECK: call
; Call frame restore
; CHECK-NEXT: in r30, 61
diff --git a/llvm/test/CodeGen/AVR/issue-163015.ll b/llvm/test/CodeGen/AVR/issue-163015.ll
new file mode 100644
index 0000000..6c4dc51
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/issue-163015.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mtriple=avr | FileCheck %s
+
+@ui1 = protected local_unnamed_addr global i64 zeroinitializer, align 8
+@ui2 = protected local_unnamed_addr global i64 zeroinitializer, align 8
+@failed = private unnamed_addr addrspace(1) constant [12 x i8] c"test failed\00"
+@stats2 = external protected global i16, align 1
+
+; CHECK-LABEL: main:
+define i32 @main() addrspace(1) {
+entry:
+ store i64 94, ptr @ui1, align 8
+ store i64 53, ptr @ui2, align 8
+ tail call addrspace(1) void @foo(i16 ptrtoint (ptr addrspace(1) @failed to i16), i16 11, i8 2, i16 32, ptr @stats2)
+ %0 = load i64, ptr @ui1, align 8
+ %1 = load i64, ptr @ui2, align 8
+
+; COM: CHECK: call __udivdi3
+ %2 = udiv i64 %0, %1
+
+; Look for the buggy pattern where r30/r31 are clobbered, corrupting the stack pointer.
+; CHECK-NOT: std Z+{{[1-9]+}}, r30
+; CHECK-NOT: std Z+{{[1-9]+}}, r31
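+; A failing build emitted sequences roughly like (offsets illustrative):
+;   std Z+5, r30
+;   std Z+6, r31
+; i.e. the halves of the Z base pointer showing up as store data, a sign they
+; had been reused as scratch registers.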
+
+; CHECK: call expect
+ tail call addrspace(1) void @expect(i64 %2, i64 1, i16 ptrtoint (ptr addrspace(1) @failed to i16), i16 11, i8 2, i16 33)
+
+; CHECK: ret
+ ret i32 0
+}
+
+declare protected void @expect(i64, i64, i16, i16, i8, i16) local_unnamed_addr addrspace(1)
+declare protected void @foo(i16, i16, i8, i16, ptr) local_unnamed_addr addrspace(1)
diff --git a/llvm/test/CodeGen/BPF/BTF/binary-format.ll b/llvm/test/CodeGen/BPF/BTF/binary-format.ll
index 3b1be1a..fd09566 100644
--- a/llvm/test/CodeGen/BPF/BTF/binary-format.ll
+++ b/llvm/test/CodeGen/BPF/BTF/binary-format.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -gdwarf-5 -gembed-source -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f(i32 returned %a) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f(i32 returned %a) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata i32 %a, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 %a, !dbg !14
@@ -42,10 +42,7 @@ entry:
; CHECK-EB: 0x00000050 00000008 0000000f 00000018 00000410
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
index 2fb8d25..1672334 100644
--- a/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
+++ b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
@@ -24,7 +24,7 @@
@bpf_log = internal global ptr inttoptr (i64 999 to ptr), align 8, !dbg !17
; Function Attrs: nounwind
-define dso_local void @prog1() #0 !dbg !28 {
+define dso_local void @prog1() !dbg !28 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !31, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 0, i64 0), !dbg !36, !llvm.preserve.access.index !7
@@ -33,10 +33,10 @@ entry:
}
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.btf.type.id(i32, i64) #1
+declare i64 @llvm.bpf.btf.type.id(i32, i64)
; Function Attrs: nounwind
-define dso_local void @prog2() #0 !dbg !38 {
+define dso_local void @prog2() !dbg !38 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !39, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 1, i64 0), !dbg !40, !llvm.preserve.access.index !6
@@ -45,7 +45,7 @@ entry:
}
; Function Attrs: nounwind
-define dso_local void @prog3() #0 !dbg !42 {
+define dso_local void @prog3() !dbg !42 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !43, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 2, i64 1), !dbg !44, !llvm.preserve.access.index !11
@@ -96,9 +96,6 @@ entry:
; CHECK-NEXT: .long 48
; CHECK-NEXT: .long 7
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!24, !25, !26}
!llvm.ident = !{!27}
diff --git a/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll b/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
index cc14a32b..1c2b1d1 100644
--- a/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
+++ b/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
@@ -10,7 +10,7 @@
@g = dso_local local_unnamed_addr global i32 5, section "maps", align 4
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 {
+define dso_local i32 @test() local_unnamed_addr {
%1 = load i32, ptr @g, align 4, !tbaa !2
ret i32 %1
}
@@ -18,8 +18,6 @@ define dso_local i32 @test() local_unnamed_addr #0 {
; CHECK-NOT: .section .BTF
; CHECK-NOT: .section .BTF.ext
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll b/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
index a855016..fa0aa5b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
@@ -10,7 +10,7 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind readonly
-define dso_local i64 @test(ptr readonly %skb) local_unnamed_addr #0 !dbg !13 {
+define dso_local i64 @test(ptr readonly %skb) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %skb, metadata !17, metadata !DIExpression()), !dbg !18
%call = tail call i64 @llvm.bpf.load.byte(ptr %skb, i64 10), !dbg !19
@@ -54,13 +54,9 @@ entry:
; CHECK-NEXT: .byte 0
; Function Attrs: nounwind readonly
-declare !dbg !4 i64 @llvm.bpf.load.byte(ptr, i64) #1
+declare !dbg !4 i64 @llvm.bpf.load.byte(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readonly }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll b/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
index b7cbb48f..9a31beb 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
@@ -8,9 +8,9 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
@@ -49,11 +49,7 @@ entry:
; CHECK: .ascii "char" # string offset=55
; CHECK: .ascii "global_func" # string offset=60
-declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll b/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
index 299aa1d..c3f93ab 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
@@ -10,7 +10,7 @@
@a = external dso_local local_unnamed_addr global i8, align 1
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !7 {
%1 = load i8, ptr @a, align 1, !dbg !11, !tbaa !12
%2 = sext i8 %1 to i32, !dbg !11
ret i32 %2, !dbg !15
@@ -45,8 +45,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .ascii "/home/yhs/work/tests/llvm/bug/test.c" # string offset=15
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
index d11addd..0ddd634 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
@@ -10,12 +10,12 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
+declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -69,10 +69,6 @@ declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .ascii "abc" # string offset=72
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
!llvm.ident = !{!12}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
index 9e82295..fbfc03b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
@@ -10,12 +10,12 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
+declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -62,10 +62,6 @@ declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .ascii "global_func" # string offset=60
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
!llvm.ident = !{!12}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
index 262abb3..0ba4732 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
@@ -10,9 +10,9 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
@@ -61,11 +61,7 @@ entry:
; CHECK-NEXT: .ascii "global_func" # string offset=60
; CHECK-NEXT: .byte 0
-declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
index b6e14fc..27793d1 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
@@ -13,9 +13,9 @@
@ch = external dso_local local_unnamed_addr global i8, section "abc", align 1, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !19
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !19
%0 = load i8, ptr @ch, align 1, !dbg !20, !tbaa !21
%conv = sext i8 %0 to i32, !dbg !20
%add = add nsw i32 %call, %conv, !dbg !24
@@ -84,11 +84,7 @@ entry:
; CHECK-NEXT: .ascii "abc" # string offset=75
; CHECK-NEXT: .byte 0
-declare !dbg !6 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !6 dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
index 63ab578..ffec16b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
@@ -12,7 +12,7 @@
@global = extern_weak dso_local local_unnamed_addr global %struct.t1, align 4, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test() local_unnamed_addr !dbg !15 {
entry:
%0 = load i32, ptr @global, align 4, !dbg !18, !tbaa !19
ret i32 %0, !dbg !24
@@ -68,8 +68,6 @@ entry:
; CHECK-NEXT: .ascii "global" # string offset=66
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
!llvm.ident = !{!14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
index 3ecda4f..dfe5e5e 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
@@ -13,7 +13,7 @@
@global = external dso_local local_unnamed_addr global %struct.t1, align 4, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test() local_unnamed_addr !dbg !15 {
entry:
%0 = load i32, ptr @global, align 4, !dbg !18, !tbaa !19
ret i32 %0, !dbg !24
@@ -69,8 +69,6 @@ entry:
; CHECK-NEXT: .ascii "global" # string offset=66
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
!llvm.ident = !{!14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
index 57ca18c..7d28987 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
@@ -12,15 +12,15 @@
@ch = extern_weak dso_local local_unnamed_addr global i8, section "abc", align 1, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !19
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !19
%0 = load i8, ptr @ch, align 1, !dbg !20, !tbaa !21
%conv = sext i8 %0 to i32, !dbg !20
%add = add nsw i32 %call, %conv, !dbg !24
ret i32 %add, !dbg !25
}
-declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
+declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -84,10 +84,6 @@ declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .ascii "abc" # string offset=75
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
!llvm.ident = !{!15}
diff --git a/llvm/test/CodeGen/BPF/BTF/filename.ll b/llvm/test/CodeGen/BPF/BTF/filename.ll
index ae08aea..0d8742fa 100644
--- a/llvm/test/CodeGen/BPF/BTF/filename.ll
+++ b/llvm/test/CodeGen/BPF/BTF/filename.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: norecurse nounwind readnone uwtable
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
ret i32 0, !dbg !11
}
@@ -63,8 +63,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 1038 # Line 1 Col 14
-attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
index b700be9..f8c3de5 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
@@ -14,7 +14,7 @@
@b1 = common dso_local local_unnamed_addr global %struct.t1 zeroinitializer, align 8, !dbg !6
; Function Attrs: nounwind readnone
-define dso_local void @f1(i32 %p2) local_unnamed_addr #0 !dbg !19 {
+define dso_local void @f1(i32 %p2) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata i32 %p2, metadata !21, metadata !DIExpression()), !dbg !22
ret void, !dbg !23
@@ -95,10 +95,7 @@ entry:
; CHECK-NEXT: .long 3091 # Line 3 Col 19
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-non-void.ll b/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
index 2f562b2..745645d 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f1(i32 returned) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(i32 returned) local_unnamed_addr !dbg !7 {
call void @llvm.dbg.value(metadata i32 %0, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 %0, !dbg !14
}
@@ -73,10 +73,7 @@ define dso_local i32 @f1(i32 returned) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 1042 # Line 1 Col 18
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-source.ll b/llvm/test/CodeGen/BPF/BTF/func-source.ll
index a485d2c..c305e83 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-source.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-source.ll
@@ -10,7 +10,7 @@
; correct reference to the lines in the string table.
; Function Attrs: norecurse nounwind readnone
-define dso_local void @f() local_unnamed_addr #0 !dbg !7 {
+define dso_local void @f() local_unnamed_addr !dbg !7 {
entry:
ret void, !dbg !10
}
@@ -63,8 +63,6 @@ entry:
; CHECK-NEXT: .long 18
; CHECK-NEXT: .long 1040 # Line 1 Col 16
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
index 2570536..388deeb 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
@@ -9,7 +9,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f(i32 returned %a) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f(i32 returned %a) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata i32 %a, metadata !14, metadata !DIExpression()), !dbg !15
ret i32 %a, !dbg !16
@@ -85,12 +85,8 @@ entry:
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 3092 # Line 3 Col 20
-
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll b/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
index f9439e6..380642c 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f1(i32) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(i32) local_unnamed_addr !dbg !7 {
call void @llvm.dbg.value(metadata i32 %0, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 0, !dbg !14
}
@@ -69,10 +69,7 @@ define dso_local i32 @f1(i32) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 1042 # Line 1 Col 18
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-void.ll b/llvm/test/CodeGen/BPF/BTF/func-void.ll
index bf70b6a..9205700 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-void.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-void.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: norecurse nounwind readnone
-define dso_local void @f1() local_unnamed_addr #0 !dbg !7 {
+define dso_local void @f1() local_unnamed_addr !dbg !7 {
ret void, !dbg !10
}
@@ -57,8 +57,6 @@ define dso_local void @f1() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 1040 # Line 1 Col 16
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
index 6ef7a30..5c797f7 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
@@ -21,16 +21,16 @@
@__const.test.val = private unnamed_addr constant %struct.anon { [4 x i32] [i32 2, i32 3, i32 4, i32 5] }, align 4
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
%val = alloca %struct.anon, align 4
call void @llvm.dbg.value(metadata ptr @.str, metadata !12, metadata !DIExpression()), !dbg !25
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %val) #4, !dbg !26
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %val), !dbg !26
call void @llvm.dbg.declare(metadata ptr %val, metadata !16, metadata !DIExpression()), !dbg !27
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 4 dereferenceable(16) %val, ptr nonnull align 4 dereferenceable(16) @__const.test.val, i64 16, i1 false), !dbg !27
- tail call void @foo(ptr @.str) #4, !dbg !28
- call void @foo(ptr nonnull %val) #4, !dbg !29
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %val) #4, !dbg !30
+ tail call void @foo(ptr @.str), !dbg !28
+ call void @foo(ptr nonnull %val), !dbg !29
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %val), !dbg !30
ret i32 0, !dbg !31
}
@@ -39,27 +39,21 @@ entry:
; CHECK-NOT: BTF_KIND_DATASEC
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare !dbg !32 dso_local void @foo(ptr) local_unnamed_addr #3
+declare !dbg !32 dso_local void @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
index 0e183a5..243cd87 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
@@ -19,14 +19,14 @@
@__const.test.val = private unnamed_addr constant %struct.anon { [4 x i32] [i32 2, i32 3, i32 4, i32 5], i8 4 }, align 4
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
%val = alloca %struct.anon, align 4
- call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %val) #4, !dbg !23
+ call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %val), !dbg !23
call void @llvm.dbg.declare(metadata ptr %val, metadata !12, metadata !DIExpression()), !dbg !24
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 4 dereferenceable(20) %val, ptr nonnull align 4 dereferenceable(20) @__const.test.val, i64 20, i1 false), !dbg !24
- call void @foo(ptr nonnull %val) #4, !dbg !25
- call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %val) #4, !dbg !26
+ call void @foo(ptr nonnull %val), !dbg !25
+ call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %val), !dbg !26
ret i32 0, !dbg !27
}
@@ -38,24 +38,18 @@ entry:
; CHECK: .ascii ".rodata" # string offset=42
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare !dbg !28 dso_local void @foo(ptr) local_unnamed_addr #3
+declare !dbg !28 dso_local void @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var.ll b/llvm/test/CodeGen/BPF/BTF/local-var.ll
index dd79923..fa605d8 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @foo(i8 signext) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @foo(i8 signext) local_unnamed_addr !dbg !7 {
%2 = alloca i16, align 2
call void @llvm.dbg.value(metadata i8 %0, metadata !13, metadata !DIExpression()), !dbg !17
call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %2), !dbg !18
@@ -59,20 +59,16 @@ define dso_local i32 @foo(i8 signext) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .byte 0
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/pruning-const.ll b/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
index 8fef9c2..733815d 100644
--- a/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
+++ b/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
@@ -22,14 +22,14 @@
%struct.s2 = type { %struct.tt }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !23
ret i32 0, !dbg !24
}
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr #1 !dbg !25 {
+define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr !dbg !25 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = load i32, ptr %arg, align 4, !dbg !35, !tbaa !36
@@ -64,11 +64,7 @@ entry:
; CHECK: .ascii "m2" # string offset=72
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll b/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
index 4c8aa1f..727daea 100644
--- a/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
+++ b/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
@@ -24,14 +24,14 @@
%struct.s2 = type { %struct.tt }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !23, metadata !DIExpression()), !dbg !24
ret i32 0, !dbg !25
}
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr #1 !dbg !26 {
+define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !34, metadata !DIExpression()), !dbg !35
%0 = load i32, ptr %arg, align 4, !dbg !36, !tbaa !37
@@ -71,11 +71,7 @@ entry:
; CHECK: .ascii "m2" # string offset=81
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll
new file mode 100644
index 0000000..df0cbeb
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll
@@ -0,0 +1,70 @@
+; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+;
+; This IR is hand-written.
+
+; ModuleID = 'ptr-named-2.ll'
+source_filename = "ptr-named-2.ll"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpfel-unknown-none"
+
+%struct.TypeExamples = type { ptr, i32, i32, ptr }
+
+@type_examples = internal global %struct.TypeExamples zeroinitializer, align 8, !dbg !0
+
+!llvm.dbg.cu = !{!1}
+!llvm.module.flags = !{!2, !3, !4}
+!llvm.ident = !{!21}
+
+; CHECK-BTF: [1] STRUCT 'TypeExamples' size=32 vlen=4
+; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT: 'volatile' type_id=4 bits_offset=64
+; CHECK-BTF-NEXT: 'const' type_id=5 bits_offset=128
+; CHECK-BTF-NEXT: 'restrict_ptr' type_id=6 bits_offset=192
+; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-BTF-NEXT: [4] VOLATILE '(anon)' type_id=3
+; CHECK-BTF-NEXT: [5] CONST '(anon)' type_id=3
+; CHECK-BTF-NEXT: [6] RESTRICT '(anon)' type_id=7
+; CHECK-BTF-NEXT: [7] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [8] VAR 'type_examples' type_id=1, linkage=static
+; CHECK-BTF-NEXT: [9] DATASEC '.bss' size=0 vlen=1
+; CHECK-BTF-NEXT: type_id=8 offset=0 size=24
+
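+; For orientation, the debug info above describes something like the following
+; C struct (approximate: the member names "volatile" and "const" are not valid
+; C identifiers, and each member occupies a 64-bit slot in this hand-written DI):
+;
+;   struct TypeExamples {
+;     int *ptr;           /* [2] PTR      -> [3] INT */
+;     volatile int v;     /* [4] VOLATILE -> [3] INT */
+;     const int c;        /* [5] CONST    -> [3] INT */
+;     int *restrict rp;   /* [6] RESTRICT -> [7] PTR -> [3] INT */
+;   };
+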
+!0 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
+!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !6, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !7, globals: !8, splitDebugInlining: false, nameTableKind: None)
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = distinct !DIGlobalVariable(name: "type_examples", scope: !1, file: !6, line: 12, type: !9, isLocal: true, isDefinition: true)
+!6 = !DIFile(filename: "ptr-named-2.ll", directory: "/tmp")
+!7 = !{}
+!8 = !{!0}
+!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TypeExamples", file: !6, line: 5, size: 256, elements: !10)
+!10 = !{!11, !12, !13, !14}
+!11 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !9, file: !6, line: 6, baseType: !15, size: 64)
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "volatile", scope: !9, file: !6, line: 7, baseType: !17, size: 64, offset: 64)
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "const", scope: !9, file: !6, line: 8, baseType: !18, size: 64, offset: 128)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "restrict_ptr", scope: !9, file: !6, line: 9, baseType: !19, size: 64, offset: 192)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*int", baseType: !16, size: 64)
+!16 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!17 = !DIDerivedType(tag: DW_TAG_volatile_type, name: "volatile int", baseType: !16)
+!18 = !DIDerivedType(tag: DW_TAG_const_type, name: "const int", baseType: !16)
+!19 = !DIDerivedType(tag: DW_TAG_restrict_type, name: "*int restrict", baseType: !20)
+!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64)
+!21 = !{!"my hand-written IR"}
diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll
new file mode 100644
index 0000000..675c34e
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+;
+; Source:
+; #![no_std]
+; #![no_main]
+;
+; pub struct MyType {
+; ptr: *const u32,
+; }
+;
+; impl MyType {
+; pub const fn new() -> Self {
+; let ptr = core::ptr::null();
+; Self { ptr }
+; }
+; }
+;
+; unsafe impl Sync for MyType {}
+;
+; #[unsafe(no_mangle)]
+; pub static X: MyType = MyType::new();
+;
+; #[cfg(not(test))]
+; #[panic_handler]
+; fn panic(_info: &core::panic::PanicInfo) -> ! {
+; loop {}
+; }
+; Compilation flags:
+; cargo +nightly rustc -Zbuild-std=core --target=bpfel-unknown-none -- --emit=llvm-bc
+; llvm-extract --glob=X $(find target/ -name "*.bc" | head -n 1) -o ptr-named.bc
+; llvm-dis ptr-named.bc -o ptr-named.ll
+
+; ModuleID = 'ptr-named.bc'
+source_filename = "1m2uqe50qkwxmo53ydydvou91"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpfel"
+
+@X = constant [8 x i8] zeroinitializer, align 8, !dbg !0
+
+!llvm.module.flags = !{!11, !12, !13, !14}
+!llvm.ident = !{!15}
+!llvm.dbg.cu = !{!16}
+
+; CHECK-BTF: [1] STRUCT 'MyType' size=8 vlen=1
+; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [3] INT 'u32' size=4 bits_offset=0 nr_bits=32 encoding=(none)
+; CHECK-BTF-NEXT: [4] VAR 'X' type_id=1, linkage=global
+; CHECK-BTF-NEXT: [5] DATASEC '.rodata' size=0 vlen=1
+; CHECK-BTF-NEXT: type_id=4 offset=0 size=8
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "X", scope: !2, file: !3, line: 19, type: !4, isLocal: false, isDefinition: true, align: 64)
+!2 = !DINamespace(name: "ptr_named", scope: null)
+!3 = !DIFile(filename: "ptr-named/src/main.rs", directory: "/tmp/ptr-named", checksumkind: CSK_MD5, checksum: "e37168304600b30cbb5ba168f0384932")
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyType", scope: !2, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !6, templateParams: !10, identifier: "7609fa40332dd486922f074276a171c3")
+!5 = !DIFile(filename: "<unknown>", directory: "")
+!6 = !{!7}
+!7 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !4, file: !5, baseType: !8, size: 64, align: 64, flags: DIFlagPrivate)
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*const u32", baseType: !9, size: 64, align: 64, dwarfAddressSpace: 0)
+!9 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned)
+!10 = !{}
+!11 = !{i32 8, !"PIC Level", i32 2}
+!12 = !{i32 7, !"PIE Level", i32 2}
+!13 = !{i32 7, !"Dwarf Version", i32 4}
+!14 = !{i32 2, !"Debug Info Version", i32 3}
+!15 = !{!"rustc version 1.92.0-nightly (c8905eaa6 2025-09-28)"}
+!16 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !17, producer: "clang LLVM (rustc version 1.92.0-nightly (c8905eaa6 2025-09-28))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !18, splitDebugInlining: false, nameTableKind: None)
+!17 = !DIFile(filename: "ptr-named/src/main.rs/@/1m2uqe50qkwxmo53ydydvou91", directory: "/tmp/ptr-named")
+!18 = !{!0}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-func.ll b/llvm/test/CodeGen/BPF/BTF/static-func.ll
index fc79dbf..6506407 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-func.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-func.ll
@@ -9,18 +9,18 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test2() local_unnamed_addr #0 !dbg !12 {
+define dso_local i32 @test2() local_unnamed_addr !dbg !12 {
entry:
%call = tail call fastcc i32 @test1(), !dbg !13
ret i32 %call, !dbg !14
}
; Function Attrs: noinline nounwind
-define internal fastcc i32 @test1() unnamed_addr #1 !dbg !15 {
+define internal fastcc i32 @test1() unnamed_addr !dbg !15 {
entry:
- %call = tail call i32 @foo() #3, !dbg !16
+ %call = tail call i32 @foo(), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 dso_local i32 @foo() local_unnamed_addr #2
+declare !dbg !4 dso_local i32 @foo() local_unnamed_addr
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -67,11 +67,6 @@ declare !dbg !4 dso_local i32 @foo() local_unnamed_addr #2
; CHECK-NEXT: .ascii "foo" # string offset=60
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9, !10}
!llvm.ident = !{!11}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll b/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
index 1827c97..fedec38 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
@@ -17,7 +17,7 @@
@v4 = internal constant ptr null, align 8, !dbg !19
; Function Attrs: norecurse nounwind
-define dso_local i64 @foo() local_unnamed_addr #0 !dbg !27 {
+define dso_local i64 @foo() local_unnamed_addr !dbg !27 {
%1 = load volatile ptr, ptr @v1, align 8, !dbg !29, !tbaa !30
%2 = load volatile ptr, ptr @v2, align 8, !dbg !34, !tbaa !30
%3 = ptrtoint ptr %1 to i64, !dbg !35
@@ -141,8 +141,6 @@ define dso_local i64 @foo() local_unnamed_addr #0 !dbg !27 {
; CHECK-NEXT: .ascii ".rodata" # string offset=87
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!23, !24, !25}
!llvm.ident = !{!26}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
index cc785b7..deef48a 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
@@ -14,7 +14,7 @@
@a = internal global i8 3, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll b/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
index 2b62882..8f29a83 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
@@ -14,7 +14,7 @@
@a = internal global i8 3, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".data" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
index a4ae948..e16b467 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
@@ -14,7 +14,7 @@
@a = internal constant i8 0, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !22, !tbaa !23
%2 = sext i8 %1 to i32, !dbg !22
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !26, !tbaa !27
@@ -99,8 +99,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!18, !19, !20}
!llvm.ident = !{!21}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll b/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
index a9d60ce..1ddd499 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
@@ -14,7 +14,7 @@
@a = internal constant i8 0, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !22, !tbaa !23
%2 = sext i8 %1 to i32, !dbg !22
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !26, !tbaa !27
@@ -99,8 +99,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".rodata" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!18, !19, !20}
!llvm.ident = !{!21}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
index ac27b2b..0ff8f2e 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
@@ -14,7 +14,7 @@
@a = internal global i8 0, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll b/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
index 28da203..fe9f508 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
@@ -15,7 +15,7 @@
@sv = internal global { i32, i32, [10 x i8] } { i32 3, i32 4, [10 x i8] c"abcdefghi\00" }, align 4, !dbg !0
; Function Attrs: norecurse nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !21 {
+define dso_local i32 @test() local_unnamed_addr !dbg !21 {
%1 = load volatile i32, ptr @sv, align 4, !dbg !24, !tbaa !25
ret i32 %1, !dbg !29
}
@@ -104,8 +104,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !21 {
; CHECK-NEXT: .ascii ".data" # string offset=89
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!17, !18, !19}
!llvm.ident = !{!20}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var.ll b/llvm/test/CodeGen/BPF/BTF/static-var.ll
index 461bd27..f7710e3 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var.ll
@@ -14,7 +14,7 @@
@a = internal global i8 0, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".bss" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll b/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
index 5b125ea..68d4be0 100644
--- a/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
@@ -15,7 +15,7 @@
%struct.anon.0 = type { i64 }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @f1(ptr nocapture readnone %s1) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(ptr nocapture readnone %s1) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %s1, metadata !25, metadata !DIExpression()), !dbg !26
ret i32 0, !dbg !27
@@ -65,12 +65,8 @@ entry:
; CHECK: .ascii "B1" # string offset=17
; CHECK: .ascii "long int" # string offset=20
-
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll b/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
index 4b3b557..14cb8e0 100644
--- a/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
@@ -11,7 +11,7 @@
@g = weak dso_local local_unnamed_addr global i8 2, align 1, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test() local_unnamed_addr !dbg !11 {
entry:
%0 = load i8, ptr @g, align 1, !dbg !15, !tbaa !16
%conv = sext i8 %0 to i32, !dbg !15
@@ -37,9 +37,6 @@ entry:
; CHECK: .byte 103 # string offset=60
; CHECK: .ascii ".data" # string offset=62
-
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!7, !8, !9}
!llvm.ident = !{!10}
diff --git a/llvm/test/CodeGen/BPF/BTF/weak-global.ll b/llvm/test/CodeGen/BPF/BTF/weak-global.ll
index ea0a887..5605e0b 100644
--- a/llvm/test/CodeGen/BPF/BTF/weak-global.ll
+++ b/llvm/test/CodeGen/BPF/BTF/weak-global.ll
@@ -11,7 +11,7 @@
@g = weak dso_local local_unnamed_addr global i8 0, align 1, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test() local_unnamed_addr !dbg !11 {
entry:
%0 = load i8, ptr @g, align 1, !dbg !15, !tbaa !16
%conv = sext i8 %0 to i32, !dbg !15
@@ -37,8 +37,6 @@ entry:
; CHECK: .byte 103 # string offset=60
; CHECK: .ascii ".bss" # string offset=62
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!7, !8, !9}
!llvm.ident = !{!10}
diff --git a/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll b/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
index 23a4617..eecb993 100644
--- a/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
+++ b/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
@@ -13,7 +13,7 @@
%struct.s1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @foo(ptr %arg) #0 !dbg !7 {
+define dso_local i32 @foo(ptr %arg) !dbg !7 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -24,13 +24,13 @@ entry:
}
; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.btf.type.id(i32, i64) #2
+declare i64 @llvm.bpf.btf.type.id(i32, i64)
; Function Attrs: nounwind
-define dso_local i32 @bar(ptr %arg) #0 !dbg !25 {
+define dso_local i32 @bar(ptr %arg) !dbg !25 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -58,10 +58,6 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 6
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
index 40a2432..0851f25 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
@c = common dso_local global %struct.b zeroinitializer, align 4, !dbg !0
; Function Attrs: nounwind readnone
-define dso_local i32 @f() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @f() local_unnamed_addr !dbg !15 {
entry:
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr elementtype(%struct.b) nonnull @c, i32 1, i32 1), !dbg !18, !llvm.preserve.access.index !6
%1 = tail call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 0), !dbg !19
@@ -40,13 +40,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
index b8b7a0b..51df39b 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
@@ -25,7 +25,7 @@ target triple = "bpfeb"
%struct.s = type { i64, i32, i32, i32, i8, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
; CHECK-ALU64-LABEL: test:
; CHECK-ALU64: .Ltest$local:
; CHECK-ALU64-NEXT: .type .Ltest$local,@function
@@ -122,17 +122,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
@@ -177,4 +173,3 @@ attributes #2 = { nounwind readnone speculatable }
!36 = !DILocation(line: 14, column: 10, scope: !13)
!37 = !DILocation(line: 13, column: 67, scope: !13)
!38 = !DILocation(line: 12, column: 3, scope: !13)
-
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
index 4cf0a13..295c105 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
@@ -25,7 +25,7 @@ target triple = "bpfel"
%struct.s = type { i64, i32, i32, i32, i8, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
; CHECK-ALU64-LABEL: test:
; CHECK-ALU64: .Ltest$local:
; CHECK-ALU64-NEXT: .type .Ltest$local,@function
@@ -122,17 +122,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
index cdcd7e6..8f83404 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
@@ -26,7 +26,7 @@ target triple = "bpfeb"
%struct.s = type <{ i8, i16 }>
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 4), !dbg !29, !llvm.preserve.access.index !18
@@ -70,17 +70,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
index dd7f1c7..1a7619a 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
@@ -26,7 +26,7 @@ target triple = "bpfel"
%struct.s = type <{ i8, i16 }>
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 4), !dbg !29, !llvm.preserve.access.index !18
@@ -70,17 +70,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
index 126bd0a..5a98b05 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
@@ -13,7 +13,7 @@
%struct.s1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @foo(ptr %arg) #0 !dbg !7 {
+define dso_local i32 @foo(ptr %arg) !dbg !7 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -25,13 +25,13 @@ entry:
}
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind
-define dso_local i32 @bar(ptr %arg) #0 !dbg !29 {
+define dso_local i32 @bar(ptr %arg) !dbg !29 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -60,10 +60,6 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
index 90681d3c..00c3a6d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
@@ -17,7 +17,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
call void @llvm.dbg.value(metadata ptr null, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr elementtype(%struct.s1) null, i32 0, i32 0), !dbg !23, !llvm.preserve.access.index !8
@@ -40,17 +40,13 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr, i32 immarg, i32 immarg) #1
+declare ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64 immarg) #1
+declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64 immarg)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
index d6bed6c..7e2e8e6 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
@@ -15,12 +15,12 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr elementtype(%struct.s) %arg, i32 0, i32 2), !dbg !19, !llvm.preserve.access.index !11
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %0, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %1), !dbg !20
ret i32 %call, !dbg !21
}
; CHECK-LABEL: test
@@ -39,22 +39,16 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
index 525f38d..cb6674f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -85,20 +85,16 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
index 11235b5..2697201 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !32, !llvm.preserve.access.index !16
@@ -71,27 +71,23 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
index e3382d6..b7541f0 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !22
@@ -60,27 +60,23 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
index fda7592..0220567 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8, i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr readnone %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr readnone %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !23, metadata !DIExpression()), !dbg !24
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr elementtype(%struct.s1) %arg, i32 1, i32 1), !dbg !25, !llvm.preserve.access.index !17
@@ -41,17 +41,13 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
index 69872db3..0404deb 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.u1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !29, metadata !DIExpression()), !dbg !35
call void @llvm.dbg.value(metadata ptr %arg2, metadata !30, metadata !DIExpression()), !dbg !35
@@ -85,29 +85,25 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
index 90706e9..240083f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -59,24 +59,20 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
index 2297040..57dd5b7 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
@@ -19,7 +19,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !22
@@ -59,27 +59,23 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
index 503a26c..7caa667 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
@@ -23,7 +23,7 @@ target triple = "bpfeb"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -86,20 +86,16 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
index 0327f1a..c518573 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
@@ -23,7 +23,7 @@ target triple = "bpfel"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -86,20 +86,16 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
index 2a92d08..6bf29d4 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -60,24 +60,20 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
index 6e62bb3..441366f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -85,20 +85,16 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
index 77ea26a..7bc994d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -59,24 +59,20 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
index 556f69f..ebfecff 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { [5 x [5 x i8]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !32, metadata !DIExpression()), !dbg !35
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !36, !llvm.preserve.access.index !23
@@ -60,27 +60,23 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
index 2741050..d50701c 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.u1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !29, metadata !DIExpression()), !dbg !35
call void @llvm.dbg.value(metadata ptr %arg2, metadata !30, metadata !DIExpression()), !dbg !35
@@ -85,29 +85,25 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
index b71bbf3..312d40f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
@@ -25,7 +25,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !20 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !20 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !42, !llvm.preserve.access.index !24
@@ -76,24 +76,20 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!16, !17, !18}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
index 5caea97..12a21c7 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
@@ -24,7 +24,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x i32], [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !29 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !29 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !43, metadata !DIExpression()), !dbg !46
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !47, !llvm.preserve.access.index !33
@@ -66,27 +66,23 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!25, !26, !27}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
index 8b95b1c4..13c7d1d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
@@ -15,11 +15,11 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
@@ -39,19 +39,13 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
index 88658b6..8583322 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
@2 = private unnamed_addr constant [18 x i8] c"VAL10:-2147483648\00", align 1
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test() local_unnamed_addr !dbg !18 {
entry:
%0 = tail call i64 @llvm.bpf.preserve.enum.value(i32 0, ptr @0, i64 0), !dbg !23, !llvm.preserve.access.index !3
%1 = tail call i64 @llvm.bpf.preserve.enum.value(i32 1, ptr @1, i64 1), !dbg !24, !llvm.preserve.access.index !3
@@ -81,10 +81,7 @@ entry:
; CHECK-NEXT: .long 11
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.preserve.enum.value(i32, ptr, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i64 @llvm.bpf.preserve.enum.value(i32, ptr, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
index 0bdf954..6f316d9 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
@@ -17,7 +17,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = tail call i32 @llvm.bpf.preserve.type.info(i32 0, i64 0), !dbg !19, !llvm.preserve.access.index !8
%1 = tail call i32 @llvm.bpf.preserve.type.info(i32 1, i64 0), !dbg !20, !llvm.preserve.access.index !21
@@ -59,10 +59,7 @@ entry:
; CHECK-NEXT: .long 8
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
index ddd3711..d3aacc72 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
@@ -17,7 +17,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = tail call i32 @llvm.bpf.preserve.type.info(i32 0, i64 1), !dbg !19, !llvm.preserve.access.index !8
%1 = tail call i32 @llvm.bpf.preserve.type.info(i32 1, i64 1), !dbg !20, !llvm.preserve.access.index !21
@@ -59,10 +59,7 @@ entry:
; CHECK-NEXT: .long 9
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
index b2f8e48..ad4fc96 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
@@ -20,7 +20,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
call void @llvm.dbg.declare(metadata ptr undef, metadata !20, metadata !DIExpression()), !dbg !28
call void @llvm.dbg.declare(metadata ptr undef, metadata !19, metadata !DIExpression()), !dbg !29
@@ -65,14 +65,10 @@ entry:
; CHECK-NEXT: .long 9
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
index ef360929..e0217dd 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
@@ -15,11 +15,11 @@ target triple = "bpf"
%union.u = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr %arg, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
; CHECK-LABEL: test
@@ -38,19 +38,13 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll b/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
index 4c6ce1e..819ee31 100644
--- a/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
+++ b/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
%struct.tt = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !16, metadata !DIExpression()), !dbg !17
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr elementtype(%struct.tt) %arg, i32 0, i32 0), !dbg !18, !llvm.preserve.access.index !12
@@ -26,14 +26,10 @@ entry:
; CHECK-NOT: llvm.tt:0:0$0:0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll b/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
index 9998c98..c3f8395 100644
--- a/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
+++ b/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
@@ -28,7 +28,7 @@ target triple = "bpf"
%struct.data_t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local void @test(ptr readonly %args) local_unnamed_addr #0 !dbg !12 {
+define dso_local void @test(ptr readonly %args) local_unnamed_addr !dbg !12 {
entry:
%data = alloca i64, align 8
call void @llvm.dbg.value(metadata ptr %args, metadata !22, metadata !DIExpression()), !dbg !29
@@ -36,7 +36,7 @@ entry:
%1 = load i32, ptr %0, align 4, !dbg !30, !tbaa !31
%and = and i32 %1, 65536, !dbg !36
call void @llvm.dbg.value(metadata i32 %and, metadata !23, metadata !DIExpression()), !dbg !29
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %data) #5, !dbg !37
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %data), !dbg !37
call void @llvm.dbg.declare(metadata ptr %data, metadata !24, metadata !DIExpression()), !dbg !38
store i64 0, ptr %data, align 8, !dbg !38
%tobool = icmp eq i32 %and, 0, !dbg !39
@@ -60,8 +60,8 @@ lor.end: ; preds = %lor.end.critedge, %
%5 = phi i32 [ %phitmp, %cond.false ], [ 1, %lor.end.critedge ]
%d2 = getelementptr inbounds %struct.data_t, ptr %data, i64 0, i32 1, !dbg !49
store i32 %5, ptr %d2, align 4, !dbg !50, !tbaa !51
- call void @output(ptr nonnull %data) #5, !dbg !52
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %data) #5, !dbg !53
+ call void @output(ptr nonnull %data), !dbg !52
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %data), !dbg !53
ret void, !dbg !53
}
@@ -71,28 +71,21 @@ lor.end: ; preds = %lor.end.critedge, %
; CHECK: r[[LOAD]] &= 32768
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.info_ts(ptr, i32 immarg, i32 immarg) #3
+declare ptr @llvm.preserve.struct.access.index.p0.p0.info_ts(ptr, i32 immarg, i32 immarg)
-declare !dbg !4 dso_local void @output(ptr) local_unnamed_addr #4
+declare !dbg !4 dso_local void @output(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { argmemonly nounwind willreturn }
-attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9, !10}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
index 5da2bbd..1ce453c 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
@@ -18,13 +18,13 @@ target triple = "bpf"
%struct.t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !22, metadata !DIExpression()), !dbg !24
call void @llvm.dbg.value(metadata ptr %arg2, metadata !23, metadata !DIExpression()), !dbg !24
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg1, i32 1, i32 1), !dbg !25, !llvm.preserve.access.index !12
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr elementtype(%struct.t) %arg2, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !17
- %call = tail call i32 @get_value(ptr %0, ptr %1) #4, !dbg !27
+ %call = tail call i32 @get_value(ptr %0, ptr %1), !dbg !27
ret i32 %call, !dbg !28
}
@@ -46,22 +46,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr, ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr, ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
index 024ed04..0fdd704 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
@@ -24,19 +24,19 @@ target triple = "bpf"
%struct.net_device = type opaque
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca ptr, align 8
call void @llvm.dbg.value(metadata ptr %0, metadata !26, metadata !DIExpression()), !dbg !28
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2) #4, !dbg !29
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2), !dbg !29
call void @llvm.dbg.value(metadata ptr null, metadata !27, metadata !DIExpression()), !dbg !28
store ptr null, ptr %2, align 8, !dbg !30, !tbaa !31
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
- %4 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 8, ptr %3) #4, !dbg !36
+ %4 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 8, ptr %3), !dbg !36
%5 = load ptr, ptr %2, align 8, !dbg !37, !tbaa !31
call void @llvm.dbg.value(metadata ptr %5, metadata !27, metadata !DIExpression()), !dbg !28
%6 = icmp ne ptr %5, null, !dbg !38
%7 = zext i1 %6 to i32, !dbg !38
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2) #4, !dbg !39
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2), !dbg !39
ret i32 %7, !dbg !40
}
@@ -122,22 +122,16 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
index e12221e..65859c86 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !22 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !22 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !32, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !34, !llvm.preserve.access.index !26
@@ -30,7 +30,7 @@ entry:
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %2, i32 0, i32 0), !dbg !34, !llvm.preserve.access.index !4
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %3, i32 1, i32 2), !dbg !34, !llvm.preserve.access.index !5
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %4, i32 1, i32 1), !dbg !34, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %5) #4, !dbg !35
+ %call = tail call i32 @get_value(ptr %5), !dbg !35
ret i32 %call, !dbg !36
}
@@ -60,13 +60,13 @@ entry:
; CHECK-NEXT: .long 107
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
@@ -75,13 +75,7 @@ declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!18, !19, !20}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
index 1764c9d..f42e7e6 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !24 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !24 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !34, metadata !DIExpression()), !dbg !35
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !28
@@ -31,7 +31,7 @@ entry:
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x %struct.v1]]) %3, i32 1, i32 2), !dbg !36, !llvm.preserve.access.index !5
%5 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %4, i32 1, i32 3), !dbg !36, !llvm.preserve.access.index !18
%6 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %5, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %6) #4, !dbg !37
+ %call = tail call i32 @get_value(ptr %6), !dbg !37
ret i32 %call, !dbg !38
}
@@ -62,13 +62,13 @@ entry:
; CHECK-NEXT: .long 107
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
@@ -79,13 +79,7 @@ declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!20, !21, !22}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
index bbff3f6..38b1c99 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
@@ -21,12 +21,12 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !14 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !18
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr elementtype(%struct.v1) %0, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !5
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !31
+ %call = tail call i32 @get_value(ptr %1), !dbg !31
ret i32 %call, !dbg !32
}
@@ -60,22 +60,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11, !12}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
index bdc17e6..7730ee3a 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
@@ -24,12 +24,12 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !20
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr elementtype(%struct.v1) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !36
+ %call = tail call i32 @get_value(ptr %1), !dbg !36
ret i32 %call, !dbg !37
}
@@ -47,7 +47,6 @@ entry:
; CHECK: .ascii "0:1" # string offset=45
; CHECK: .ascii "v1" # string offset=91
-
; CHECK: .long 16 # FieldReloc
; CHECK-NEXT: .long 39 # Field reloc section string offset=39
; CHECK-NEXT: .long 2
@@ -60,22 +59,16 @@ entry:
; CHECK-NEXT: .long 45
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
index dea6e40..e5ef549 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
@@ -22,14 +22,14 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !19 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !30, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !24
%1 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([40 x i32]) %0, i32 1, i32 4), !dbg !32, !llvm.preserve.access.index !11
%2 = bitcast ptr %1 to ptr, !dbg !32
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %2, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %3) #4, !dbg !33
+ %call = tail call i32 @get_value(ptr %3), !dbg !33
ret i32 %call, !dbg !34
}
@@ -60,24 +60,18 @@ entry:
; CHECK-NEXT: .long 118
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
index 98fdfde..7aeaed4 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
@@ -24,14 +24,14 @@ target triple = "bpf"
%union.v1 = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !20
%1 = bitcast ptr %0 to ptr, !dbg !35
%2 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %1, i32 1), !dbg !35, !llvm.preserve.access.index !6
%b = getelementptr inbounds %union.v1, ptr %2, i64 0, i32 0, !dbg !35
- %call = tail call i32 @get_value(ptr %b) #4, !dbg !36
+ %call = tail call i32 @get_value(ptr %b), !dbg !36
ret i32 %call, !dbg !37
}
@@ -61,21 +61,15 @@ entry:
; CHECK-NEXT: .long 45
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32) #2
+declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
index 7b63699..12c3936 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.v1 = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !19 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !30, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %arg, i32 1), !dbg !32, !llvm.preserve.access.index !24
@@ -31,7 +31,7 @@ entry:
%2 = bitcast ptr %1 to ptr, !dbg !32
%3 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %2, i32 1), !dbg !32, !llvm.preserve.access.index !6
%b = getelementptr inbounds %union.v1, ptr %3, i64 0, i32 0, !dbg !32
- %call = tail call i32 @get_value(ptr %b) #4, !dbg !33
+ %call = tail call i32 @get_value(ptr %b), !dbg !33
ret i32 %call, !dbg !34
}
@@ -62,24 +62,18 @@ entry:
; CHECK-NEXT: .long 118
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32) #2
+declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
index 499e368..ee1f0e2 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
@@ -14,7 +14,7 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !20, metadata !DIExpression()), !dbg !21
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !22, !llvm.preserve.access.index !15
@@ -42,14 +42,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
index 2aadbdf..3d66435 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
@@ -14,7 +14,7 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind readnone
-define dso_local ptr @test(ptr readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local ptr @test(ptr readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !21, !llvm.preserve.access.index !13
@@ -42,14 +42,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
index 34ea050..cf75909 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
@@ -40,11 +40,11 @@ target triple = "bpfel"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !20 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !20 {
entry:
%ull = alloca i64, align 8
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !37
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ull) #5, !dbg !38
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ull), !dbg !38
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !39, !llvm.preserve.access.index !25
%1 = tail call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 0), !dbg !40
call void @llvm.dbg.value(metadata i32 %1, metadata !34, metadata !DIExpression()), !dbg !37
@@ -52,7 +52,7 @@ entry:
call void @llvm.dbg.value(metadata i32 %2, metadata !35, metadata !DIExpression()), !dbg !37
%idx.ext = zext i32 %1 to i64, !dbg !43
%add.ptr = getelementptr i8, ptr %arg, i64 %idx.ext, !dbg !43
- call void @bpf_probe_read(ptr nonnull %ull, i32 %2, ptr %add.ptr) #5, !dbg !44
+ call void @bpf_probe_read(ptr nonnull %ull, i32 %2, ptr %add.ptr), !dbg !44
%3 = call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 4), !dbg !45
call void @llvm.dbg.value(metadata i32 %3, metadata !36, metadata !DIExpression()), !dbg !37
%4 = load i64, ptr %ull, align 8, !dbg !46, !tbaa !47
@@ -68,7 +68,7 @@ entry:
%shr3 = lshr i64 %shl, %sh_prom1, !dbg !53
%retval.0.in = select i1 %tobool, i64 %shr3, i64 %shr, !dbg !53
%retval.0 = trunc i64 %retval.0.in to i32, !dbg !37
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %ull) #5, !dbg !54
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %ull), !dbg !54
ret i32 %retval.0, !dbg !54
}
@@ -114,28 +114,21 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #2
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
-declare dso_local void @bpf_probe_read(ptr, i32, ptr) local_unnamed_addr #3
+declare dso_local void @bpf_probe_read(ptr, i32, ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #4
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone speculatable willreturn }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!16, !17, !18}
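; The fieldinfo test above pairs two intrinsics: a preserve.struct.access.index
; call pins down the field, and llvm.bpf.preserve.field.info then queries one
; property of that field, selected by the i64 immediate (the test uses 0 and 4;
; in the BPF backend these kinds correspond to byte offset and bitfield left
; shift, though the test itself only checks the emitted relocations). A minimal
; sketch, with a hypothetical %struct.t and @field_off:
;
;   %struct.t = type { i32, i16 }
;   define i32 @field_off(ptr %p) {
;     ; field #1 of %struct.t (a real test also attaches
;     ; !llvm.preserve.access.index metadata to this call)
;     %fld = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.t) %p, i32 1, i32 1)
;     %off = tail call i32 @llvm.bpf.preserve.field.info.p0(ptr %fld, i64 0)
;     ret i32 %off
;   }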
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
index 01c5e69..d5b2d052 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
@@ -42,7 +42,7 @@ target triple = "bpfeb"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind readonly
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !26 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !42, !llvm.preserve.access.index !31
@@ -157,17 +157,13 @@ sw.epilog: ; preds = %entry, %sw.bb9, %sw
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23, !24}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
index d458d41..5076e79 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
@@ -42,7 +42,7 @@ target triple = "bpfel"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind readonly
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !26 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !42, !llvm.preserve.access.index !31
@@ -157,17 +157,13 @@ sw.epilog: ; preds = %entry, %sw.bb9, %sw
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23, !24}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
index 7657b78..2f42118 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
@@ -19,10 +19,10 @@ target triple = "bpf"
@g = dso_local global %struct.v3 zeroinitializer, section "stats", align 4, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) nonnull @g, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !7
- %call = tail call i32 @get_value(ptr %0) #3, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
@@ -45,15 +45,10 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
index bed14ab..f43df76 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
@@ -19,12 +19,12 @@ target triple = "bpf"
@g = dso_local global [4 x [5 x %struct.v3]] zeroinitializer, section "stats", align 4, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !23 {
+define dso_local i32 @test() local_unnamed_addr !dbg !23 {
entry:
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype([4 x [5 x %struct.v3]]) nonnull @g, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !6
%1 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype([5 x %struct.v3]) %0, i32 1, i32 2), !dbg !26, !llvm.preserve.access.index !16
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %1, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %2) #3, !dbg !27
+ %call = tail call i32 @get_value(ptr %2), !dbg !27
ret i32 %call, !dbg !28
}
@@ -47,21 +47,15 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!19, !20, !21}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
index 49b89e2..5bc2bf9 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
@@ -19,11 +19,11 @@ target triple = "bpf"
@g = dso_local local_unnamed_addr global ptr null, section "stats", align 8, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = load ptr, ptr @g, align 8, !dbg !20, !tbaa !21
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !20, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %1) #3, !dbg !25
+ %call = tail call i32 @get_value(ptr %1), !dbg !25
ret i32 %call, !dbg !26
}
@@ -45,15 +45,10 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
index 4ff170cf..983383c 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
@@ -13,11 +13,11 @@
target triple = "bpf"
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !10 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !10 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !14, metadata !DIExpression()), !dbg !15
%0 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype(i32) %arg, i32 0, i32 4), !dbg !16, !llvm.preserve.access.index !4
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !17
+ %call = tail call i32 @get_value(ptr %0), !dbg !17
ret i32 %call, !dbg !18
}
@@ -26,19 +26,13 @@ entry:
; CHECK: .section .BTF.ext,"",@progbits
; CHECK-NOT: .long 16 # FieldReloc
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!6, !7, !8}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
index e5f86c2..c67d57f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
@@ -29,7 +29,7 @@ target triple = "bpf"
%struct.t1 = type { i32 }
; Function Attrs: nounwind
-define dso_local void @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local void @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr elementtype(%struct.r1) %arg, i32 0, i32 0), !dbg !30, !llvm.preserve.access.index !11
@@ -38,7 +38,7 @@ entry:
call void @llvm.dbg.value(metadata ptr %1, metadata !25, metadata !DIExpression()), !dbg !29
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr elementtype(%struct.t1) %1, i32 0, i32 0), !dbg !32, !llvm.preserve.access.index !17
call void @llvm.dbg.value(metadata ptr %2, metadata !27, metadata !DIExpression()), !dbg !29
- tail call void @test1(ptr %0, ptr %1, ptr %2) #4, !dbg !36
+ tail call void @test1(ptr %0, ptr %1, ptr %2), !dbg !36
ret void, !dbg !37
}
@@ -67,24 +67,18 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.t1s.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.t1s.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr, i32, i32)
-declare dso_local void @test1(ptr, ptr, ptr) local_unnamed_addr #2
+declare dso_local void @test1(ptr, ptr, ptr) local_unnamed_addr
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
index 8ca3ef5..7ffb4de 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
@@ -17,14 +17,14 @@ target triple = "bpf"
%struct.v3 = type { i32, [4 x [4 x i32]] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !21 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !21 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !25, metadata !DIExpression()), !dbg !26
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !27, !llvm.preserve.access.index !4
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !27, !llvm.preserve.access.index !6
%2 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x i32]]) %1, i32 1, i32 2), !dbg !27, !llvm.preserve.access.index !11
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x i32]) %2, i32 1, i32 3), !dbg !27, !llvm.preserve.access.index !15
- %call = tail call i32 @get_value(ptr %3) #4, !dbg !28
+ %call = tail call i32 @get_value(ptr %3), !dbg !28
ret i32 %call, !dbg !29
}
@@ -46,27 +46,21 @@ entry:
; CHECK-NEXT: .long 58
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!17, !18, !19}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
index b2ba5a8..55bb7c58 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
@@ -17,7 +17,7 @@ target triple = "bpf"
%struct.v3 = type { i32, [4 x [4 x [4 x i32]]] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !23 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !23 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !29, !llvm.preserve.access.index !4
@@ -25,7 +25,7 @@ entry:
%2 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x [4 x i32]]]) %1, i32 1, i32 2), !dbg !29, !llvm.preserve.access.index !11
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x i32]]) %2, i32 1, i32 3), !dbg !29, !llvm.preserve.access.index !15
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x i32]) %3, i32 1, i32 2), !dbg !29, !llvm.preserve.access.index !17
- %call = tail call i32 @get_value(ptr %4) #4, !dbg !30
+ %call = tail call i32 @get_value(ptr %4), !dbg !30
ret i32 %call, !dbg !31
}
@@ -47,29 +47,23 @@ entry:
; CHECK-NEXT: .long 58
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!19, !20, !21}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
index e00bbb8..a5b4604 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
@@ -28,16 +28,16 @@ target triple = "bpf"
%struct.net_device = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !28, metadata !DIExpression()), !dbg !30
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !31
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !31
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr elementtype(%struct.net_device) %3, i32 0, i32 0), !dbg !32, !llvm.preserve.access.index !23
- %5 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %4) #4, !dbg !33
+ %5 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %4), !dbg !33
%6 = load i32, ptr %2, align 4, !dbg !34, !tbaa !35
call void @llvm.dbg.value(metadata i32 %6, metadata !29, metadata !DIExpression()), !dbg !30
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !39
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !39
ret i32 %6, !dbg !40
}
@@ -130,25 +130,19 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
index b4d1844..ffd77ed 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
@@ -16,11 +16,11 @@ target triple = "bpf"
%struct.v3 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !21, !llvm.preserve.access.index !4
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !22
+ %call = tail call i32 @get_value(ptr %0), !dbg !22
ret i32 %call, !dbg !23
}
@@ -42,19 +42,13 @@ entry:
; CHECK-NEXT: .long 32
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
index 87b88bc..cb0aff3 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
@@ -16,12 +16,12 @@ target triple = "bpf"
%struct.v3 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !21, !llvm.preserve.access.index !4
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !21, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !22
+ %call = tail call i32 @get_value(ptr %1), !dbg !22
ret i32 %call, !dbg !23
}
@@ -42,22 +42,16 @@ entry:
; CHECK-NEXT: .long 32
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
index 8ebbfea..2081b3f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
@@ -27,17 +27,17 @@ target triple = "bpf"
%struct.anon = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !31, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !34
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !34
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr elementtype([10 x %struct.anon]) %3, i32 1, i32 5), !dbg !35, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr elementtype(%struct.anon) %4, i32 0, i32 0), !dbg !35, !llvm.preserve.access.index !24
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !36
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !36
%7 = load i32, ptr %2, align 4, !dbg !37, !tbaa !38
call void @llvm.dbg.value(metadata i32 %7, metadata !32, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !42
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !42
ret i32 %7, !dbg !43
}
@@ -140,28 +140,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
index 64ec250..4e51366 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
@@ -28,17 +28,17 @@ target triple = "bpf"
%struct.net_device = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !31, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !34
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !34
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr elementtype([10 x %struct.net_device]) %3, i32 1, i32 5), !dbg !35, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr elementtype(%struct.net_device) %4, i32 0, i32 0), !dbg !35, !llvm.preserve.access.index !24
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !36
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !36
%7 = load i32, ptr %2, align 4, !dbg !37, !tbaa !38
call void @llvm.dbg.value(metadata i32 %7, metadata !32, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !42
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !42
ret i32 %7, !dbg !43
}
@@ -143,28 +143,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
index ed462e1..eb0620d 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
@@ -20,12 +20,12 @@ target triple = "bpf"
%struct.__s = type { [7 x i32] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !24, metadata !DIExpression()), !dbg !25
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr elementtype(%struct.__s) %arg, i32 0, i32 0), !dbg !26, !llvm.preserve.access.index !13
%1 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([7 x i32]) %0, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !19
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !27
+ %call = tail call i32 @get_value(ptr %1), !dbg !27
ret i32 %call, !dbg !28
}
@@ -48,22 +48,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
index 6b806ae..c4edda1 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
@@ -18,7 +18,7 @@ target triple = "bpf"
%struct.__t = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !18, metadata !DIExpression()), !dbg !19
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr elementtype(%struct.__t) %arg, i32 0, i32 0), !dbg !20, !llvm.preserve.access.index !4
@@ -50,14 +50,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
index c2b5a11..f8cf253 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
@@ -20,11 +20,11 @@ target triple = "bpf"
%struct.__s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr elementtype(%struct.__s) %arg, i32 1, i32 1), !dbg !23, !llvm.preserve.access.index !14
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !24
+ %call = tail call i32 @get_value(ptr %0), !dbg !24
ret i32 %call, !dbg !25
}
@@ -47,19 +47,13 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
index a63b7e7..0fe7c1f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
@@ -18,7 +18,7 @@ target triple = "bpf"
%union.__t = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !18, metadata !DIExpression()), !dbg !19
%0 = tail call ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr %arg, i32 0), !dbg !20, !llvm.preserve.access.index !4
@@ -50,14 +50,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
index 4b3d178..aa8705d 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
@@ -20,11 +20,11 @@ target triple = "bpf"
%union.__s = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr %arg, i32 1), !dbg !23, !llvm.preserve.access.index !14
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !24
+ %call = tail call i32 @get_value(ptr %0), !dbg !24
ret i32 %call, !dbg !25
}
@@ -47,19 +47,13 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
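; Taken together, the typedef/union tests above exercise all three access-index
; intrinsics; their shapes, as declared throughout this patch, differ only in
; the arguments:
;
;   declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32 immarg, i32 immarg)
;   declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32 immarg, i32 immarg)
;   declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32 immarg)
;
; The struct and array forms take a base pointer (annotated with elementtype()
; at the call site), a GEP index, and a field/element index; the union form
; takes just the base pointer and the member index. Each call also carries
; !llvm.preserve.access.index metadata tying it to the debug-info type.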
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
index e757327..5195d17 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
@@ -24,13 +24,13 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr elementtype([7 x %union.u]) %arg, i32 0, i32 1), !dbg !30, !llvm.preserve.access.index !14
%1 = tail call ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr %0, i32 1), !dbg !30, !llvm.preserve.access.index !16
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %1, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !20
- %call = tail call i32 @get_value(ptr %2) #4, !dbg !31
+ %call = tail call i32 @get_value(ptr %2), !dbg !31
ret i32 %call, !dbg !32
}
@@ -53,25 +53,19 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR:[0-9]+]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
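; The offset-reloc tests exercise the BPF CO-RE access-path intrinsics: each
; struct/union/array step is routed through an llvm.preserve.*.access.index
; call whose !llvm.preserve.access.index attachment names the debug-info type,
; and the backend folds the chain into a field relocation in .BTF.ext. A
; minimal single-step sketch (the intrinsic suffix, struct type, and metadata
; id are illustrative stand-ins, with !9 standing in for the real
; DICompositeType; for a flat struct both index operands pick the same member):
%struct.t2 = type { i32, i32 }
define ptr @second_field(ptr %p) {
entry:
  %f = call ptr @llvm.preserve.struct.access.index.p0.p0.t2(ptr elementtype(%struct.t2) %p, i32 1, i32 1), !llvm.preserve.access.index !9
  ret ptr %f
}
declare ptr @llvm.preserve.struct.access.index.p0.p0.t2(ptr, i32 immarg, i32 immarg)
!9 = !{}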
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
index 824eba9a..e156999 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
@@ -31,17 +31,17 @@ target triple = "bpf"
%union.anon = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !32, metadata !DIExpression()), !dbg !34
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !35
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !35
%3 = tail call ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr %0, i32 1), !dbg !36, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr elementtype(%struct.anon) %3, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr %4, i32 0), !dbg !36, !llvm.preserve.access.index !27
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !37
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !37
%7 = load i32, ptr %2, align 4, !dbg !38, !tbaa !39
call void @llvm.dbg.value(metadata i32 %7, metadata !33, metadata !DIExpression()), !dbg !34
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !43
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !43
ret i32 %7, !dbg !44
}
@@ -145,28 +145,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/store-addr.ll b/llvm/test/CodeGen/BPF/CORE/store-addr.ll
index 33bbd01..2c8a0c4 100644
--- a/llvm/test/CodeGen/BPF/CORE/store-addr.ll
+++ b/llvm/test/CodeGen/BPF/CORE/store-addr.ll
@@ -22,17 +22,17 @@ target triple = "bpf"
%struct.t = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !14 {
entry:
%param = alloca [1 x i64], align 8
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !27
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %param) #5, !dbg !28
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %param), !dbg !28
call void @llvm.dbg.declare(metadata ptr %param, metadata !23, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr elementtype(%struct.t) %arg, i32 0, i32 0), !dbg !30, !llvm.preserve.access.index !18
%1 = ptrtoint ptr %0 to i64, !dbg !31
store i64 %1, ptr %param, align 8, !dbg !33, !tbaa !34
- %call = call i32 @foo(ptr nonnull %param) #5, !dbg !38
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %param) #5, !dbg !39
+ %call = call i32 @foo(ptr nonnull %param), !dbg !38
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %param), !dbg !39
ret i32 %call, !dbg !40
}
@@ -41,28 +41,21 @@ entry:
; CHECK: *(u64 *)(r10 - 8) = r1
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32, i32) #3
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32, i32)
-declare !dbg !5 dso_local i32 @foo(ptr) local_unnamed_addr #4
+declare !dbg !5 dso_local i32 @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind }
-attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11, !12}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll b/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
index 8a4b37d..09ca422 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
@@ -20,12 +20,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%retval = alloca i32, align 4
%ret = alloca i32, align 4
%cleanup.dest.slot = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !2
%0 = load i32, ptr %ret, align 4, !tbaa !2
@@ -62,25 +62,20 @@ if.end: ; preds = %lor.lhs.false
br label %cleanup
cleanup: ; preds = %if.end, %if.then
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
%3 = load i32, ptr %retval, align 4
ret i32 %3
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
-declare dso_local i32 @bar(i32) #2
+declare dso_local i32 @bar(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll b/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
index ad157fe..bbda062 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
@@ -18,12 +18,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%retval = alloca i32, align 4
%ret = alloca i32, align 4
%cleanup.dest.slot = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !2
%0 = load i32, ptr %ret, align 4, !tbaa !2
@@ -65,25 +65,20 @@ if.end3: ; preds = %if.end
br label %cleanup
cleanup: ; preds = %if.end3, %if.then2, %if.then
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
%3 = load i32, ptr %retval, align 4
ret i32 %3
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
-declare dso_local i32 @bar(i32) #2
+declare dso_local i32 @bar(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll b/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
index d118fa0..d34d652 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
@@ -15,12 +15,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local ptr @test(ptr %p) #0 {
+define dso_local ptr @test(ptr %p) {
entry:
%p.addr = alloca ptr, align 8
%ret = alloca i64, align 8
store ptr %p, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.start.p0(i64 8, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 8, ptr %ret)
%call = call i64 @foo()
store i64 %call, ptr %ret, align 8, !tbaa !6
%0 = load i64, ptr %ret, align 8, !tbaa !6
@@ -36,7 +36,7 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
%3 = load ptr, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.end.p0(i64 8, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 8, ptr %ret)
ret ptr %3
}
; CHECK-COMMON: [[REG6:r[0-9]+]] = r1
@@ -57,17 +57,12 @@ if.end: ; preds = %if.then, %entry
; CHECK-COMMON: exit
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i64 @foo(...) #2
+declare dso_local i64 @foo(...)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll b/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
index 218fa5d..5f3fa94 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
@@ -15,12 +15,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local ptr @test(ptr %p) #0 {
+define dso_local ptr @test(ptr %p) {
entry:
%p.addr = alloca ptr, align 8
%ret = alloca i32, align 4
store ptr %p, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !6
%0 = load i32, ptr %ret, align 4, !tbaa !6
@@ -37,7 +37,7 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
%3 = load ptr, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
ret ptr %3
}
@@ -66,17 +66,12 @@ if.end: ; preds = %if.then, %entry
; CHECK-COMMON: exit
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/callx.ll b/llvm/test/CodeGen/BPF/callx.ll
index d83e0f6..e027c1f 100644
--- a/llvm/test/CodeGen/BPF/callx.ll
+++ b/llvm/test/CodeGen/BPF/callx.ll
@@ -3,16 +3,13 @@
; int test(int (*f)(void)) { return f(); }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr nocapture %f) local_unnamed_addr #0 {
+define dso_local i32 @test(ptr nocapture %f) local_unnamed_addr {
entry:
- %call = tail call i32 %f() #1
+ %call = tail call i32 %f()
; CHECK: callx r{{[0-9]+}}
ret i32 %call
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/dwarfdump.ll b/llvm/test/CodeGen/BPF/dwarfdump.ll
index a3a6b52..d50c025 100644
--- a/llvm/test/CodeGen/BPF/dwarfdump.ll
+++ b/llvm/test/CodeGen/BPF/dwarfdump.ll
@@ -10,7 +10,7 @@ target triple = "bpf"
@testprog.myvar_c = internal unnamed_addr global i32 0, align 4, !dbg !0
; Function Attrs: nounwind
-define i32 @testprog(i32, i32) local_unnamed_addr #0 !dbg !2 {
+define i32 @testprog(i32, i32) local_unnamed_addr !dbg !2 {
tail call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !11, metadata !16), !dbg !17
tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !12, metadata !16), !dbg !18
%3 = load i32, ptr @testprog.myvar_c, align 4, !dbg !19, !tbaa !20
@@ -21,10 +21,7 @@ define i32 @testprog(i32, i32) local_unnamed_addr #0 !dbg !2 {
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!13, !14}
diff --git a/llvm/test/CodeGen/BPF/i128.ll b/llvm/test/CodeGen/BPF/i128.ll
index a966e3e..3c94e0c 100644
--- a/llvm/test/CodeGen/BPF/i128.ll
+++ b/llvm/test/CodeGen/BPF/i128.ll
@@ -19,14 +19,14 @@
%struct.ipv6_key_t = type { i32, i128, i16 }
; Function Attrs: nounwind
-define dso_local i32 @test(i32 %pid) local_unnamed_addr #0 {
+define dso_local i32 @test(i32 %pid) local_unnamed_addr {
entry:
%ipv6_key = alloca %struct.ipv6_key_t, align 16
- call void @llvm.lifetime.start.p0(i64 48, ptr nonnull %ipv6_key) #4
+ call void @llvm.lifetime.start.p0(i64 48, ptr nonnull %ipv6_key)
call void @llvm.memset.p0.i64(ptr nonnull align 16 dereferenceable(48) %ipv6_key, i8 0, i64 48, i1 false)
store i32 %pid, ptr %ipv6_key, align 16, !tbaa !2
- call void @test1(ptr nonnull %ipv6_key) #4
- call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %ipv6_key) #4
+ call void @test1(ptr nonnull %ipv6_key)
+ call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %ipv6_key)
ret i32 0
}
@@ -35,21 +35,15 @@ entry:
; CHECK: *(u32 *)(r10 - 48) = r{{[0-9]+}}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: argmemonly nounwind willreturn writeonly
-declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
-declare dso_local void @test1(ptr) local_unnamed_addr #3
+declare dso_local void @test1(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { argmemonly nounwind willreturn writeonly }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/is_trunc_free.ll b/llvm/test/CodeGen/BPF/is_trunc_free.ll
index fe00731..6bb8568 100644
--- a/llvm/test/CodeGen/BPF/is_trunc_free.ll
+++ b/llvm/test/CodeGen/BPF/is_trunc_free.ll
@@ -29,7 +29,7 @@
%struct.env_t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %skb) local_unnamed_addr #0 {
+define dso_local i32 @test(ptr %skb) local_unnamed_addr {
entry:
%data_end1 = getelementptr inbounds %struct.env_t, ptr %skb, i64 0, i32 1
%0 = load i32, ptr %data_end1, align 4, !tbaa !2
@@ -49,7 +49,7 @@ if.end10: ; preds = %entry
%sub.ptr.lhs.cast = ptrtoint ptr %add.ptr to i64
%4 = trunc i64 %sub.ptr.lhs.cast to i32
%conv13 = sub i32 %4, %2
- %call = tail call i32 @work(ptr nonnull %skb, i32 %conv13) #2
+ %call = tail call i32 @work(ptr nonnull %skb, i32 %conv13)
br label %cleanup
cleanup: ; preds = %entry, %if.end10
@@ -59,11 +59,7 @@ cleanup: ; preds = %entry, %if.end10
; CHECK: w{{[0-9]+}} = *(u32 *)(r{{[0-9]+}} + 0)
-declare dso_local i32 @work(ptr, i32) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare dso_local i32 @work(ptr, i32) local_unnamed_addr
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/is_zext_free.ll b/llvm/test/CodeGen/BPF/is_zext_free.ll
index 4b81a90..3b794a9 100644
--- a/llvm/test/CodeGen/BPF/is_zext_free.ll
+++ b/llvm/test/CodeGen/BPF/is_zext_free.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -emit-llvm -S test.c
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test(i64 %x, i64 %y) local_unnamed_addr #0 {
+define dso_local i32 @test(i64 %x, i64 %y) local_unnamed_addr {
entry:
%and = and i64 %y, %x
%conv = trunc i64 %and to i32
@@ -17,8 +17,6 @@ entry:
; CHECK: r[[REG1:[0-9]+]] = r{{[0-9]+}}
; CHECK: w[[REG1]] &= w{{[0-9]+}}
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/objdump_two_funcs.ll b/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
index fb1043c..8158a1b 100644
--- a/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
+++ b/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
@@ -14,7 +14,7 @@
; clang -target bpf -S -gdwarf-5 -gembed-source -emit-llvm -g -O2 bug.c
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @func1(i32 %a) local_unnamed_addr #0 section "s1" !dbg !7 {
+define dso_local i32 @func1(i32 %a) local_unnamed_addr section "s1" !dbg !7 {
entry:
; CHECK: <func1>:
call void @llvm.dbg.value(metadata i32 %a, metadata !12, metadata !DIExpression()), !dbg !13
@@ -24,7 +24,7 @@ entry:
}
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @func2(i32 %a) local_unnamed_addr #0 section "s2" !dbg !16 {
+define dso_local i32 @func2(i32 %a) local_unnamed_addr section "s2" !dbg !16 {
entry:
; CHECK: <func2>:
call void @llvm.dbg.value(metadata i32 %a, metadata !18, metadata !DIExpression()), !dbg !19
@@ -35,10 +35,7 @@ entry:
}
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/optnone-1.ll b/llvm/test/CodeGen/BPF/optnone-1.ll
index 68046bf..f45c85b 100644
--- a/llvm/test/CodeGen/BPF/optnone-1.ll
+++ b/llvm/test/CodeGen/BPF/optnone-1.ll
@@ -5,7 +5,7 @@
; clang -target bpf -g -S -emit-llvm test.c
; Function Attrs: noinline nounwind optnone
-define dso_local i32 @test(i32 %a, i32 %b) #0 !dbg !7 {
+define dso_local i32 @test(i32 %a, i32 %b) !dbg !7 {
entry:
%a.addr = alloca i32, align 4
%b.addr = alloca i32, align 4
@@ -22,10 +22,7 @@ entry:
; CHECK-LABEL: test
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable}
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/reloc-btf-2.ll b/llvm/test/CodeGen/BPF/reloc-btf-2.ll
index 7398257..430abc7 100644
--- a/llvm/test/CodeGen/BPF/reloc-btf-2.ll
+++ b/llvm/test/CodeGen/BPF/reloc-btf-2.ll
@@ -14,7 +14,7 @@
@s = internal global i32 0, align 4, !dbg !6
; Function Attrs: norecurse nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test() local_unnamed_addr !dbg !14 {
%1 = load i32, ptr @g, align 4, !dbg !17, !tbaa !18
%2 = load volatile i32, ptr @s, align 4, !dbg !22, !tbaa !18
%3 = add nsw i32 %2, %1, !dbg !23
@@ -27,8 +27,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !14 {
; CHECK-RELOC: R_BPF_64_NODYLD32 g
; CHECK-RELOC: RELOCATION RECORDS FOR [.BTF.ext]:
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!10, !11, !12}
!llvm.ident = !{!13}
diff --git a/llvm/test/CodeGen/BPF/reloc-btf.ll b/llvm/test/CodeGen/BPF/reloc-btf.ll
index b9f6e3a..875bfa1 100644
--- a/llvm/test/CodeGen/BPF/reloc-btf.ll
+++ b/llvm/test/CodeGen/BPF/reloc-btf.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=bpfel -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
ret i32 0, !dbg !11
}
@@ -13,8 +13,6 @@ entry:
; CHECK-RELOC: RELOCATION RECORDS FOR [.BTF.ext]:
; CHECK-RELOC: R_BPF_64_NODYLD32 .text
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/simplifycfg.ll b/llvm/test/CodeGen/BPF/simplifycfg.ll
index fcd2321..d53b51a 100644
--- a/llvm/test/CodeGen/BPF/simplifycfg.ll
+++ b/llvm/test/CodeGen/BPF/simplifycfg.ll
@@ -38,15 +38,15 @@ target triple = "bpf"
%struct.FrameData = type { ptr }
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%frame_ptr = alloca ptr, align 8
%frame = alloca %struct.FrameData, align 8
%i = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 8, ptr %frame_ptr) #3
- call void @llvm.lifetime.start.p0(i64 8, ptr %frame) #3
+ call void @llvm.lifetime.start.p0(i64 8, ptr %frame_ptr)
+ call void @llvm.lifetime.start.p0(i64 8, ptr %frame)
call void @get_frame_ptr(ptr %frame_ptr)
- call void @llvm.lifetime.start.p0(i64 4, ptr %i) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %i)
store i32 0, ptr %i, align 4, !tbaa !2
br label %for.cond
@@ -61,7 +61,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(i64 4, ptr %i) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %i)
br label %for.end
for.body: ; preds = %for.cond
@@ -93,25 +93,20 @@ for.end: ; preds = %for.cond.cleanup
%5 = load ptr, ptr %frame_ptr, align 8, !tbaa !6
%cmp2 = icmp eq ptr %5, null
%conv = zext i1 %cmp2 to i32
- call void @llvm.lifetime.end.p0(i64 8, ptr %frame) #3
- call void @llvm.lifetime.end.p0(i64 8, ptr %frame_ptr) #3
+ call void @llvm.lifetime.end.p0(i64 8, ptr %frame)
+ call void @llvm.lifetime.end.p0(i64 8, ptr %frame_ptr)
ret i32 %conv
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local void @get_frame_ptr(ptr) #2
+declare dso_local void @get_frame_ptr(ptr)
-declare dso_local i32 @get_data(ptr, ptr) #2
+declare dso_local i32 @get_data(ptr, ptr)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/warn-stack.ll b/llvm/test/CodeGen/BPF/warn-stack.ll
index 58a6e4c..5e62a91 100644
--- a/llvm/test/CodeGen/BPF/warn-stack.ll
+++ b/llvm/test/CodeGen/BPF/warn-stack.ll
@@ -1,43 +1,37 @@
; RUN: not llc -mtriple=bpfel < %s 2>&1 >/dev/null | FileCheck %s
;; CHECK-NOT: nowarn
-define void @nowarn() local_unnamed_addr #0 !dbg !6 {
+define void @nowarn() local_unnamed_addr !dbg !6 {
%1 = alloca [504 x i8], align 1
- call void @llvm.lifetime.start.p0(i64 504, ptr nonnull %1) #4, !dbg !15
+ call void @llvm.lifetime.start.p0(i64 504, ptr nonnull %1), !dbg !15
tail call void @llvm.dbg.declare(metadata ptr %1, metadata !10, metadata !16), !dbg !17
- call void @doit(ptr nonnull %1) #4, !dbg !18
- call void @llvm.lifetime.end.p0(i64 504, ptr nonnull %1) #4, !dbg !19
+ call void @doit(ptr nonnull %1), !dbg !18
+ call void @llvm.lifetime.end.p0(i64 504, ptr nonnull %1), !dbg !19
ret void, !dbg !19
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @doit(ptr) local_unnamed_addr #3
+declare void @doit(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; CHECK: error: warn_stack.c
; CHECK: BPF stack limit
-define void @warn() local_unnamed_addr #0 !dbg !20 {
+define void @warn() local_unnamed_addr !dbg !20 {
%1 = alloca [512 x i8], align 1
- call void @llvm.lifetime.start.p0(i64 512, ptr nonnull %1) #4, !dbg !26
+ call void @llvm.lifetime.start.p0(i64 512, ptr nonnull %1), !dbg !26
tail call void @llvm.dbg.declare(metadata ptr %1, metadata !22, metadata !16), !dbg !27
- call void @doit(ptr nonnull %1) #4, !dbg !28
- call void @llvm.lifetime.end.p0(i64 512, ptr nonnull %1) #4, !dbg !29
+ call void @doit(ptr nonnull %1), !dbg !28
+ call void @llvm.lifetime.end.p0(i64 512, ptr nonnull %1), !dbg !29
ret void, !dbg !29
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!llvm.ident = !{!5}
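; warn-stack.ll pivots on the fixed BPF frame size: the kernel caps a BPF
; program's stack at 512 bytes, so the 504-byte buffer in @nowarn still fits
; while the 512-byte buffer in @warn cannot, and llc emits the "BPF stack
; limit" diagnostic the CHECK lines expect. A minimal reproducer sketch for
; the failing side (assuming the same bpfel target as the test):
define void @overflow() {
entry:
  %buf = alloca [512 x i8], align 1   ; consumes the entire 512-byte frame
  call void @use(ptr %buf)
  ret void
}
declare void @use(ptr)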
diff --git a/llvm/test/CodeGen/BPF/xadd.ll b/llvm/test/CodeGen/BPF/xadd.ll
index a3ec323..8d232ffb 100644
--- a/llvm/test/CodeGen/BPF/xadd.ll
+++ b/llvm/test/CodeGen/BPF/xadd.ll
@@ -17,7 +17,7 @@ target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128"
target triple = "bpf"
; Function Attrs: nounwind
-define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %ptr, metadata !13, metadata !DIExpression()), !dbg !15
%0 = atomicrmw add ptr %ptr, i32 4 seq_cst, !dbg !16
@@ -28,10 +28,7 @@ entry:
}
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll
new file mode 100644
index 0000000..6f1bbd0
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll
@@ -0,0 +1,13 @@
+; RUN: opt -S -dxil-cbuffer-access -mtriple=dxil--shadermodel6.3-library %s | FileCheck %s
+; Check that we correctly ignore cbuffers that were nulled out by optimizations.
+
+%__cblayout_CB = type <{ float }>
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 4, 0)) poison
+@x = external local_unnamed_addr addrspace(2) global float, align 4
+
+; CHECK-NOT: !hlsl.cbs =
+!hlsl.cbs = !{!0, !1, !2}
+
+!0 = !{ptr @CB.cb, ptr addrspace(2) @x}
+!1 = !{ptr @CB.cb, null}
+!2 = !{null, null}
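; The new unused.ll test leans on a general IR property: when an optimization
; deletes a global that module-level metadata still references, the operand is
; replaced with null rather than left dangling, so any pass walking !hlsl.cbs
; has to tolerate null entries. A minimal sketch of the surviving shape
; (@live.cb is a hypothetical name):
@live.cb = global i32 0
!hlsl.cbs = !{!3, !4}
!3 = !{ptr @live.cb, null}   ; the bound resource global was optimized away
!4 = !{null, null}           ; the whole cbuffer entry was optimized away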
diff --git a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
index 4f13f47..56798c8 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
@@ -28,6 +28,11 @@ define void @test() {
@llvm.dx.resource.handlefrombinding(i32 0, i32 10, i32 1, i32 0, ptr @SB.str)
; CHECK: %"StructuredBuffer<struct.S>" = type { %struct.S }
+ ; StructuredBuffer<float[3][2]>
+ %struct1 = call target("dx.RawBuffer", [3 x [2 x float]], 0, 0)
+ @llvm.dx.resource.handlefrombinding(i32 0, i32 12, i32 1, i32 0, ptr null)
+ ; CHECK: %"StructuredBuffer<float[3][2]>" = type { [3 x [2 x float]] }
+
; ByteAddressBuffer
%byteaddr = call target("dx.RawBuffer", i8, 0, 0)
@llvm.dx.resource.handlefrombinding(i32 0, i32 20, i32 1, i32 0, ptr null)
@@ -40,12 +45,14 @@ define void @test() {
; CHECK-NEXT: @[[T1:.*]] = external constant %"Buffer<int32_t>"
; CHECK-NEXT: @[[T2:.*]] = external constant %"Buffer<uint32_t3>"
; CHECK-NEXT: @[[S0:.*]] = external constant %"StructuredBuffer<struct.S>"
+; CHECK-NEXT: @[[S1:.*]] = external constant %"StructuredBuffer<float[3][2]>"
; CHECK-NEXT: @[[B0:.*]] = external constant %ByteAddressBuffer
; CHECK: !{i32 0, ptr @[[T0]], !"A"
; CHECK: !{i32 1, ptr @[[T1]], !""
; CHECK: !{i32 2, ptr @[[T2]], !""
; CHECK: !{i32 3, ptr @[[S0]], !"SB"
-; CHECK: !{i32 4, ptr @[[B0]], !""
+; CHECK: !{i32 4, ptr @[[S1]], !""
+; CHECK: !{i32 5, ptr @[[B0]], !""
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll
new file mode 100644
index 0000000..4385da3
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll
@@ -0,0 +1,63 @@
+; Make sure we do not assert for the cases we do not handle.
+; RUN: llc -march=hexagon -mattr=+hvx,+hvx-length128b,+hvxv75,+v75,-long-calls < %s | FileCheck %s
+
+; Mainly make sure we do not core dump.
+; CHECK-NOT: scatter
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind memory(argmem: write, inaccessiblemem: readwrite)
+define dso_local void @foo(ptr noundef writeonly captures(none) %cptr, i32 noundef %T, i32 noundef %W) local_unnamed_addr #0 {
+entry:
+ %invariant.gep11 = getelementptr i8, ptr %cptr, i32 0
+ %invariant.gep13 = getelementptr i8, ptr %invariant.gep11, i32 0
+ %cmp.not15 = icmp ugt i32 8, %T
+ br i1 %cmp.not15, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph: ; preds = %entry
+ %cmp3.not8 = icmp ugt i32 8, %W
+ %conv.ripple.LS.instance = trunc i32 %W to i8
+ %conv.ripple.LS.instance.ripple.bcast.splatinsert = insertelement <64 x i8> poison, i8 %conv.ripple.LS.instance, i64 0
+ %conv.ripple.LS.instance.ripple.bcast.splat = shufflevector <64 x i8> %conv.ripple.LS.instance.ripple.bcast.splatinsert, <64 x i8> poison, <64 x i32> zeroinitializer
+ br label %for.cond1.preheader
+
+for.cond.loopexit: ; preds = %for.body5, %for.cond1.preheader
+ %add = add i32 %add17, 8
+ %cmp.not = icmp ugt i32 %add, %T
+ br i1 %cmp.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.loopexit
+ %add17 = phi i32 [ 8, %for.cond1.preheader.lr.ph ], [ %add, %for.cond.loopexit ]
+ %t.016 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add17, %for.cond.loopexit ]
+ br i1 %cmp3.not8, label %for.cond.loopexit, label %for.body5.lr.ph
+
+for.body5.lr.ph: ; preds = %for.cond1.preheader
+ %gep14 = getelementptr i8, ptr %invariant.gep13, i32 %t.016
+ br label %for.body5
+
+for.cond.cleanup: ; preds = %for.cond.loopexit, %entry
+ ret void
+
+for.body5: ; preds = %for.body5.lr.ph, %for.body5
+ %add210 = phi i32 [ 8, %for.body5.lr.ph ], [ %add2, %for.body5 ]
+ %w.09 = phi i32 [ 0, %for.body5.lr.ph ], [ %add210, %for.body5 ]
+ %gep = getelementptr i8, ptr %gep14, i32 %w.09
+ %gep.ripple.LS.instance = getelementptr i8, ptr %gep, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %conv.ripple.LS.instance.ripple.bcast.splat, <64 x ptr> %gep.ripple.LS.instance, i32 1, <64 x i1> splat (i1 true))
+ %add2 = add i32 %add210, 8
+ %cmp3.not = icmp ugt i32 %add2, %W
+ br i1 %cmp3.not, label %for.cond.loopexit, label %for.body5
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.ripple.block.setsize.i32(i32 immarg %0, i32 immarg %1, i32 %2) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare i32 @llvm.ripple.block.index.i32(i32 immarg %0, i32 immarg %1) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare i32 @llvm.ripple.block.getsize.i32(i32 immarg %0, i32 immarg %1) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #3
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
new file mode 100644
index 0000000..83fd63e
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_gather_32:
+; CHECK: vtmp.w = vgather
+; CHECK-LABEL: Ripple_gather_16:
+; CHECK: vtmp.h = vgather
+; CHECK-LABEL: Ripple_gather_8:
+; CHECK: vand
+; CHECK: vpacke
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <32 x ptr> %source.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+ %0 = load <32 x i32>, ptr %indexes, align 4
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i32, <32 x ptr> %source.ripple.bcast.splat, <32 x i32> %0
+ %1 = tail call <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr> %arrayidx2.ripple.vectorized, i32 4, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i32> poison)
+ store <32 x i32> %1, ptr %destination, align 4
+ ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <64 x ptr> %source.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %0 = load <64 x i16>, ptr %indexes, align 2
+ %idxprom.ripple.vectorized = zext <64 x i16> %0 to <64 x i32>
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i16, <64 x ptr> %source.ripple.bcast.splat, <64 x i32> %idxprom.ripple.vectorized
+ %1 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.vectorized, i32 2, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i16> poison)
+ store <64 x i16> %1, ptr %destination, align 2
+ ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <128 x ptr> %source.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+ %0 = load <128 x i8>, ptr %indexes, align 1
+ %idxprom.ripple.vectorized = zext <128 x i8> %0 to <128 x i32>
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i8, <128 x ptr> %source.ripple.bcast.splat, <128 x i32> %idxprom.ripple.vectorized
+ %1 = tail call <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr> %arrayidx2.ripple.vectorized, i32 1, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <128 x i8> poison)
+ store <128 x i8> %1, ptr %destination, align 1
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i32>) #1
+declare <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr>, i32 immarg, <64 x i1>, <64 x i16>) #1
+declare <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr> %0, i32 immarg %1, <128 x i1> %2, <128 x i8> %3) #1
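; The gather tests above spell out their all-true masks one lane at a time;
; the equivalent constant can also be written with the splat shorthand that
; ripple_scalarize_scatter.ll already uses, which keeps wide-vector tests
; readable. A minimal equivalent sketch:
define <64 x i16> @gather_all_lanes(<64 x ptr> %addrs) {
entry:
  %v = call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %addrs, i32 2, <64 x i1> splat (i1 true), <64 x i16> poison)
  ret <64 x i16> %v
}
declare <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr>, i32 immarg, <64 x i1>, <64 x i16>)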
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
new file mode 100644
index 0000000..1bd79d7
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
@@ -0,0 +1,54 @@
+; Verify that we generate HVX vgather for the given input.
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+; CHECK-LABEL: SpVV_Ripple:
+; CHECK: vtmp.h = vgather(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local i32 @SpVV_Ripple(ptr nocapture noundef writeonly %scratchpad, ptr nocapture noundef readonly %Source_1, ptr nocapture noundef readonly %S_index, i32 noundef %nS, ptr nocapture noundef readonly %Source_2) local_unnamed_addr #1 {
+entry:
+ %Source_2.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %Source_2, i64 0
+ %Source_2.ripple.bcast.splat = shufflevector <64 x ptr> %Source_2.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %div16 = lshr i32 %nS, 6
+ %cmp6.not = icmp ult i32 %nS, 64
+ br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %lsr.iv17 = phi ptr [ %scevgep18, %for.body ], [ %S_index, %entry ]
+ %lsr.iv = phi ptr [ %scevgep, %for.body ], [ %Source_1, %entry ]
+ %result.08.ripple.vectorized = phi <64 x i32> [ %add8.ripple.vectorized, %for.body ], [ zeroinitializer, %entry ]
+ %_ripple_block_0.07 = phi i32 [ %add9, %for.body ], [ 0, %entry ]
+ %.ripple.LS.instance = load <64 x i16>, ptr %lsr.iv17, align 2
+ %idxprom.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance to <64 x i32>
+ %arrayidx2.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %Source_2.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+ %.ripple.LS.instance13 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.LS.instance, i32 2, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i16> poison)
+ store <64 x i16> %.ripple.LS.instance13, ptr %scratchpad, align 2
+ %.ripple.LS.instance15 = load <64 x i16>, ptr %lsr.iv, align 2
+ %conv.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance15 to <64 x i32>
+ %conv6.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance13 to <64 x i32>
+ %mul7.ripple.LS.instance = mul nsw <64 x i32> %conv.ripple.LS.instance, %conv6.ripple.LS.instance
+ %add8.ripple.vectorized = add <64 x i32> %mul7.ripple.LS.instance, %result.08.ripple.vectorized
+ %add9 = add nuw nsw i32 %_ripple_block_0.07, 1
+ %scevgep = getelementptr i8, ptr %lsr.iv, i32 128
+ %scevgep18 = getelementptr i8, ptr %lsr.iv17, i32 128
+ %cmp = icmp ult i32 %add9, %div16
+ br i1 %cmp, label %for.body, label %for.end
+for.end: ; preds = %for.body, %entry
+ %result.0.lcssa.ripple.LS.instance = phi <64 x i32> [ zeroinitializer, %entry ], [ %add8.ripple.vectorized, %for.body ]
+ %rdx.shuf = shufflevector <64 x i32> %result.0.lcssa.ripple.LS.instance, <64 x i32> poison, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx = add <64 x i32> %result.0.lcssa.ripple.LS.instance, %rdx.shuf
+ %rdx.shuf19 = shufflevector <64 x i32> %bin.rdx, <64 x i32> poison, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx20 = add <64 x i32> %bin.rdx, %rdx.shuf19
+ %rdx.shuf21 = shufflevector <64 x i32> %bin.rdx20, <64 x i32> poison, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx22 = add <64 x i32> %bin.rdx20, %rdx.shuf21
+ %rdx.shuf23 = shufflevector <64 x i32> %bin.rdx22, <64 x i32> poison, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx24 = add <64 x i32> %bin.rdx22, %rdx.shuf23
+ %rdx.shuf25 = shufflevector <64 x i32> %bin.rdx24, <64 x i32> poison, <64 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx26 = add <64 x i32> %bin.rdx24, %rdx.shuf25
+ %rdx.shuf27 = shufflevector <64 x i32> %bin.rdx26, <64 x i32> poison, <64 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx28 = add <64 x i32> %bin.rdx26, %rdx.shuf27
+ %0 = extractelement <64 x i32> %bin.rdx28, i32 0
+ ret i32 %0
+}
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
new file mode 100644
index 0000000..85d2999
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=hexagon -mattr=+hvx-length128b,+hvxv73,+v73,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_scatter_8:
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK-LABEL: Ripple_scatter_16:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h = v{{[0-9]+}}
+; CHECK-LABEL: Ripple_scatter_32:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.w).w = v{{[0-9]+}}
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local void @Ripple_scatter_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <128 x ptr> %destination.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <128 x i8>, ptr %source, align 1
+ %.ripple.LS.instance = load <128 x i8>, ptr %indexes, align 1
+ %idxprom.ripple.LS.instance = zext <128 x i8> %.ripple.LS.instance to <128 x i32>
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i8, <128 x ptr> %destination.ripple.bcast.splat, <128 x i32> %idxprom.ripple.LS.instance
+ %cst_ptr_to_i32 = ptrtoint ptr %destination to i32
+ tail call void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %.ripple.LS.instance11, <128 x ptr> %arrayidx3.ripple.LS.instance, i32 1, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define dso_local void @Ripple_scatter_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <64 x ptr> %destination.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <64 x i16>, ptr %source, align 2
+ %.ripple.LS.instance = load <64 x i16>, ptr %indexes, align 2
+ %idxprom.ripple.LS.instance = zext <64 x i16> %.ripple.LS.instance to <64 x i32>
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %destination.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+ tail call void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %.ripple.LS.instance11, <64 x ptr> %arrayidx3.ripple.LS.instance, i32 2, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define dso_local void @Ripple_scatter_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <32 x ptr> %destination.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <32 x i32>, ptr %source, align 4
+ %.ripple.LS.instance = load <32 x i32>, ptr %indexes, align 4
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i32, <32 x ptr> %destination.ripple.bcast.splat, <32 x i32> %.ripple.LS.instance
+ tail call void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %.ripple.LS.instance11, <32 x ptr> %arrayidx3.ripple.LS.instance, i32 4, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %0, <128 x ptr> %1, i32 immarg %2, <128 x i1> %3) #2
+declare void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #2
+declare void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %0, <32 x ptr> %1, i32 immarg %2, <32 x i1> %3) #2
diff --git a/llvm/test/CodeGen/Hexagon/masked_gather.ll b/llvm/test/CodeGen/Hexagon/masked_gather.ll
new file mode 100644
index 0000000..461fd79
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/masked_gather.ll
@@ -0,0 +1,58 @@
+; This produced a masked gather that we are not yet handling
+; REQUIRES: asserts
+; RUN: opt -march=hexagon -passes=loop-vectorize -hexagon-autohvx -mattr=+hvx-length128b,+hvxv68,+v68,+hvx-ieee-fp,-long-calls,-packets -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Original C++
+; clang -c -Os -mhvx -mhvx-ieee-fp -fvectorize -mno-packets -fno-strict-aliasing -mv68
+;typedef struct poptContext_s * poptContext;
+;typedef struct { unsigned int bits[1]; } pbm_set;
+;struct poptContext_s { pbm_set * arg_strip; };
+;
+;int poptStrippedArgv(poptContext con, int argc, char ** argv) {
+; int numargs = argc;
+; for (int i = 1; i < argc; i++) {
+; if (((((con->arg_strip)->bits)[((i) / (8 * sizeof (unsigned int)))] & ((unsigned int) 1 << ((i) % (8 * sizeof (unsigned int))))) != 0))
+; numargs--;
+; }
+; return numargs;
+;}
+
+; CHECK-NOT: masked_gather
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon-unknown-unknown-elf"
+
+; Function Attrs: nofree norecurse nosync nounwind optsize memory(read, inaccessiblemem: none)
+define dso_local i32 @poptStrippedArgv(ptr noundef readonly captures(none) %con, i32 noundef %argc, ptr noundef readnone captures(none) %argv) local_unnamed_addr #0 {
+entry:
+ %cmp8 = icmp sgt i32 %argc, 1
+ br i1 %cmp8, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %0 = load ptr, ptr %con, align 4
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %numargs.0.lcssa = phi i32 [ %argc, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %numargs.0.lcssa
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.010 = phi i32 [ 1, %for.body.lr.ph ], [ %inc, %for.body ]
+ %numargs.09 = phi i32 [ %argc, %for.body.lr.ph ], [ %spec.select, %for.body ]
+ %div7 = lshr i32 %i.010, 5
+ %arrayidx = getelementptr inbounds nuw [1 x i32], ptr %0, i32 0, i32 %div7
+ %1 = load i32, ptr %arrayidx, align 4
+ %rem = and i32 %i.010, 31
+ %shl = shl nuw i32 1, %rem
+ %and = and i32 %1, %shl
+ %cmp1.not = icmp ne i32 %and, 0
+ %dec = sext i1 %cmp1.not to i32
+ %spec.select = add nsw i32 %numargs.09, %dec
+ %inc = add nuw nsw i32 %i.010, 1
+ %exitcond.not = icmp eq i32 %inc, %argc
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/CodeGen/Hexagon/vector-gather.ll b/llvm/test/CodeGen/Hexagon/vector-gather.ll
new file mode 100644
index 0000000..5700380
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vector-gather.ll
@@ -0,0 +1,27 @@
+; REQUIRES: hexagon-registered-target
+; RUN: llc -march=hexagon -mcpu=hexagonv73 -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s
+
+target triple = "hexagon"
+
+@VTCM_SCATTER16_ADDRESS = dso_local global i32 0, align 4
+@region_len = dso_local global i32 16383, align 4
+
+; CHECK: [[ADR:r[0-9]+]] = memw(gp+#VTCM_SCATTER16_ADDRESS)
+; CHECK: vtmp.h = vgather([[ADR]],m0,v0.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
+
+define dso_local void @vector_gather_16(ptr noundef %vgather, <32 x i32> noundef %offsets) #0 {
+entry:
+ %vgather.addr = alloca ptr, align 4
+ %offsets.addr = alloca <32 x i32>, align 128
+ store ptr %vgather, ptr %vgather.addr, align 4
+ store <32 x i32> %offsets, ptr %offsets.addr, align 128
+ %0 = load ptr, ptr %vgather.addr, align 4
+ %1 = load i32, ptr @VTCM_SCATTER16_ADDRESS, align 4
+ %2 = load i32, ptr @region_len, align 4
+ %3 = load <32 x i32>, ptr %offsets.addr, align 128
+ call void @llvm.hexagon.V6.vgathermh.128B(ptr %0, i32 %1, i32 %2, <32 x i32> %3)
+ ret void
+}
+
+declare void @llvm.hexagon.V6.vgathermh.128B(ptr, i32, i32, <32 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
index 245f764..7149cdb 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
@@ -32,9 +32,7 @@ define <16 x i16> @shuffle_v16i16(<16 x i16> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
-; CHECK-NEXT: xvshuf.w $xr1, $xr2, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> <i32 8, i32 9, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
@@ -55,9 +53,7 @@ define <16 x i16> @shuffle_v16i16_same_lane(<16 x i16> %a) {
define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
; CHECK-LABEL: shuffle_v8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 226
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
@@ -93,9 +89,7 @@ define <4 x i64> @shuffle_v4i64_same_lane(<4 x i64> %a) {
define <8 x float> @shuffle_v8f32(<8 x float> %a) {
; CHECK-LABEL: shuffle_v8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 226
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %shuffle
diff --git a/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask-invalid-scalar.mir b/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask-invalid-scalar.mir
new file mode 100644
index 0000000..6c6f9b6
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask-invalid-scalar.mir
@@ -0,0 +1,15 @@
+# RUN: not llc -mtriple=aarch64-- -run-pass=none -o /dev/null %s 2>&1 | FileCheck %s
+
+---
+name: test_shuffle_0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $d0
+ %0:_(<2 x s32>) = COPY $d0
+ %2:_(<2 x s32>) = G_IMPLICIT_DEF
+ ; CHECK: [[@LINE+1]]:67: shufflemask should have > 1 element
+ %1:_(s32) = G_SHUFFLE_VECTOR %0(<2 x s32>), %2, shufflemask(0)
+ $w0 = COPY %1
+ RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask.mir b/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask.mir
index c529d63..4cc3bc6 100644
--- a/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask.mir
+++ b/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask.mir
@@ -122,54 +122,3 @@ body: |
RET_ReallyLR implicit $d0
...
-
-# CHECK-LABEL: name: test_shuffle_0
-# CHECK: G_SHUFFLE_VECTOR %0(<2 x s32>), %1, shufflemask(0)
----
-name: test_shuffle_0
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $d0
-
- %0:_(<2 x s32>) = COPY $d0
- %2:_(<2 x s32>) = G_IMPLICIT_DEF
- %1:_(s32) = G_SHUFFLE_VECTOR %0(<2 x s32>), %2, shufflemask(0)
- $w0 = COPY %1
- RET_ReallyLR implicit $w0
-
-...
-
-# CHECK-LABEL: name: test_shuffle_1
-# CHECK: G_SHUFFLE_VECTOR %0(<2 x s32>), %1, shufflemask(1)
----
-name: test_shuffle_1
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $d0
-
- %0:_(<2 x s32>) = COPY $d0
- %2:_(<2 x s32>) = G_IMPLICIT_DEF
- %1:_(s32) = G_SHUFFLE_VECTOR %0(<2 x s32>), %2, shufflemask(1)
- $w0 = COPY %1
- RET_ReallyLR implicit $w0
-
-...
-
-# CHECK-LABEL: name: test_shuffle_undef
-# CHECK: G_SHUFFLE_VECTOR %0(<2 x s32>), %1, shufflemask(undef)
----
-name: test_shuffle_undef
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $d0
-
- %0:_(<2 x s32>) = COPY $d0
- %2:_(<2 x s32>) = G_IMPLICIT_DEF
- %1:_(s32) = G_SHUFFLE_VECTOR %0(<2 x s32>), %2, shufflemask(undef)
- $w0 = COPY %1
- RET_ReallyLR implicit $w0
-
-...
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
index 2894fff..da0d13d 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
@@ -1,5 +1,5 @@
{
- "entities" : {
+ "Opcodes" : {
"ABS_Fp":[1, 2],
"ADC":[3, 4],
"ADD":[5, 6],
@@ -7,5 +7,21 @@
"ADDPDrr":[9, 10],
"ADDPSrr":[11, 12],
"ADDSDrm":[13, 14]
+ },
+ "CommonOperands": {
+ "Immediate": [0.1, 0.1],
+ "MBB": [0.2, 0.2],
+ "FrameIndex": [0.3, 0.3],
+ "GlobalAddress": [0.4, 0.4]
+ },
+ "PhysicalRegisters": {
+ "GR32": [0.5, 0.5],
+ "GR64": [0.6, 0.6],
+ "XMM": [0.7, 0.7]
+ },
+ "VirtualRegisters": {
+ "GR32": [0.8, 0.8],
+ "GR64": [0.9, 0.9],
+ "XMM": [1.0, 1.0]
}
}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
index 5de715b..f4b14a4 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
@@ -1,5 +1,5 @@
{
- "entities": {
+ "Opcodes": {
"KILL": [0.1, 0.2, 0.3],
"MOV": [0.4, 0.5, 0.6],
"LEA": [0.7, 0.8, 0.9],
@@ -18,5 +18,21 @@
"POP": [4.6, 4.7, 4.8],
"NOP": [4.9, 5.0, 5.1],
"COPY": [5.2, 5.3, 5.4]
+ },
+ "CommonOperands": {
+ "Immediate": [0.1, 0.1, 0.1],
+ "MBB": [0.2, 0.2, 0.2],
+ "FrameIndex": [0.3, 0.3, 0.3],
+ "GlobalAddress": [0.4, 0.4, 0.4]
+ },
+ "PhysicalRegisters": {
+ "GR32": [0.5, 0.5, 0.5],
+ "GR64": [0.6, 0.6, 0.6],
+ "XMM": [0.7, 0.7, 0.7]
+ },
+ "VirtualRegisters": {
+ "GR32": [0.8, 0.8, 0.8],
+ "GR64": [0.9, 0.9, 0.9],
+ "XMM": [1.0, 1.0, 1.0]
}
}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
index bf04163..6274fb7 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
@@ -1,7 +1,16 @@
{
- "entities": {
+ "Opcodes": {
"ADD": [1.0, 2.0, 3.0],
"SUB": [1.5],
"MUL": [2.0, 3.0]
+ },
+ "CommonOperands": {
+ "Immediate": [1.0]
+ },
+ "PhysicalRegisters": {
+ "GR32": [1.0, 2.0]
+ },
+ "VirtualRegisters": {
+ "GR32": [1.0, 2.0, 3.0]
}
}
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
index 63e8ccbd..7bfdf3b 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
@@ -1,5 +1,5 @@
{
- "entities": {
+ "Opcodes": {
"ADD": [],
"SUB": [],
"MUL": [],
@@ -8,5 +8,14 @@
"JMP": [],
"CALL": [],
"RET": []
+ },
+ "CommonOperands": {
+ "Immediate": []
+ },
+ "PhysicalRegisters": {
+ "GR32": []
+ },
+ "VirtualRegisters": {
+ "GR32": []
}
}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
index 6327cff..d3c0da9 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
@@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ]
Key: XSTORE: [ 0.00 0.00 ]
Key: XSUSLDTRK: [ 0.00 0.00 ]
Key: XTEST: [ 0.00 0.00 ]
+Key: Immediate: [ 0.10 0.10 ]
+Key: CImmediate: [ 0.00 0.00 ]
+Key: FPImmediate: [ 0.00 0.00 ]
+Key: MBB: [ 0.20 0.20 ]
+Key: FrameIndex: [ 0.30 0.30 ]
+Key: ConstantPoolIndex: [ 0.00 0.00 ]
+Key: TargetIndex: [ 0.00 0.00 ]
+Key: JumpTableIndex: [ 0.00 0.00 ]
+Key: ExternalSymbol: [ 0.00 0.00 ]
+Key: GlobalAddress: [ 0.40 0.40 ]
+Key: BlockAddress: [ 0.00 0.00 ]
+Key: RegisterMask: [ 0.00 0.00 ]
+Key: RegisterLiveOut: [ 0.00 0.00 ]
+Key: Metadata: [ 0.00 0.00 ]
+Key: MCSymbol: [ 0.00 0.00 ]
+Key: CFIIndex: [ 0.00 0.00 ]
+Key: IntrinsicID: [ 0.00 0.00 ]
+Key: Predicate: [ 0.00 0.00 ]
+Key: ShuffleMask: [ 0.00 0.00 ]
+Key: PhyReg_GR8: [ 0.00 0.00 ]
+Key: PhyReg_GRH8: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: PhyReg_GRH16: [ 0.00 0.00 ]
+Key: PhyReg_GR16: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK1: [ 0.00 0.00 ]
+Key: PhyReg_VK16: [ 0.00 0.00 ]
+Key: PhyReg_VK2: [ 0.00 0.00 ]
+Key: PhyReg_VK4: [ 0.00 0.00 ]
+Key: PhyReg_VK8: [ 0.00 0.00 ]
+Key: PhyReg_VK16WM: [ 0.00 0.00 ]
+Key: PhyReg_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_VK2WM: [ 0.00 0.00 ]
+Key: PhyReg_VK4WM: [ 0.00 0.00 ]
+Key: PhyReg_VK8WM: [ 0.00 0.00 ]
+Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_FPCCR: [ 0.00 0.00 ]
+Key: PhyReg_FR16X: [ 0.00 0.00 ]
+Key: PhyReg_FR16: [ 0.00 0.00 ]
+Key: PhyReg_VK16PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK2PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK4PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK8PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_FR32X: [ 0.00 0.00 ]
+Key: PhyReg_GR32: [ 0.50 0.50 ]
+Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_RFP32: [ 0.00 0.00 ]
+Key: PhyReg_VK32WM: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_CCR: [ 0.00 0.00 ]
+Key: PhyReg_DFCCR: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_RFP64: [ 0.00 0.00 ]
+Key: PhyReg_GR64: [ 0.60 0.60 ]
+Key: PhyReg_FR64X: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK64: [ 0.00 0.00 ]
+Key: PhyReg_VR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_VK64WM: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_A: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_RST: [ 0.00 0.00 ]
+Key: PhyReg_RFP80: [ 0.00 0.00 ]
+Key: PhyReg_RFP80_7: [ 0.00 0.00 ]
+Key: PhyReg_VR128X: [ 0.00 0.00 ]
+Key: PhyReg_VR128: [ 0.00 0.00 ]
+Key: PhyReg_VR256X: [ 0.00 0.00 ]
+Key: PhyReg_VR256: [ 0.00 0.00 ]
+Key: PhyReg_VR512: [ 0.00 0.00 ]
+Key: PhyReg_VR512_0_15: [ 0.00 0.00 ]
+Key: PhyReg_TILE: [ 0.00 0.00 ]
+Key: PhyReg_TILEPAIR: [ 0.00 0.00 ]
+Key: VirtReg_GR8: [ 0.00 0.00 ]
+Key: VirtReg_GRH8: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: VirtReg_GRH16: [ 0.00 0.00 ]
+Key: VirtReg_GR16: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK1: [ 0.00 0.00 ]
+Key: VirtReg_VK16: [ 0.00 0.00 ]
+Key: VirtReg_VK2: [ 0.00 0.00 ]
+Key: VirtReg_VK4: [ 0.00 0.00 ]
+Key: VirtReg_VK8: [ 0.00 0.00 ]
+Key: VirtReg_VK16WM: [ 0.00 0.00 ]
+Key: VirtReg_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_VK2WM: [ 0.00 0.00 ]
+Key: VirtReg_VK4WM: [ 0.00 0.00 ]
+Key: VirtReg_VK8WM: [ 0.00 0.00 ]
+Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_FPCCR: [ 0.00 0.00 ]
+Key: VirtReg_FR16X: [ 0.00 0.00 ]
+Key: VirtReg_FR16: [ 0.00 0.00 ]
+Key: VirtReg_VK16PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK2PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK4PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK8PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_FR32X: [ 0.00 0.00 ]
+Key: VirtReg_GR32: [ 0.80 0.80 ]
+Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_RFP32: [ 0.00 0.00 ]
+Key: VirtReg_VK32WM: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_CCR: [ 0.00 0.00 ]
+Key: VirtReg_DFCCR: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_RFP64: [ 0.00 0.00 ]
+Key: VirtReg_GR64: [ 0.90 0.90 ]
+Key: VirtReg_FR64X: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK64: [ 0.00 0.00 ]
+Key: VirtReg_VR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_VK64WM: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_A: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_RST: [ 0.00 0.00 ]
+Key: VirtReg_RFP80: [ 0.00 0.00 ]
+Key: VirtReg_RFP80_7: [ 0.00 0.00 ]
+Key: VirtReg_VR128X: [ 0.00 0.00 ]
+Key: VirtReg_VR128: [ 0.00 0.00 ]
+Key: VirtReg_VR256X: [ 0.00 0.00 ]
+Key: VirtReg_VR256: [ 0.00 0.00 ]
+Key: VirtReg_VR512: [ 0.00 0.00 ]
+Key: VirtReg_VR512_0_15: [ 0.00 0.00 ]
+Key: VirtReg_TILE: [ 0.00 0.00 ]
+Key: VirtReg_TILEPAIR: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
index 4409e6d..c6e5508 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
@@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ]
Key: XSTORE: [ 0.00 0.00 ]
Key: XSUSLDTRK: [ 0.00 0.00 ]
Key: XTEST: [ 0.00 0.00 ]
+Key: Immediate: [ 0.10 0.10 ]
+Key: CImmediate: [ 0.00 0.00 ]
+Key: FPImmediate: [ 0.00 0.00 ]
+Key: MBB: [ 0.20 0.20 ]
+Key: FrameIndex: [ 0.30 0.30 ]
+Key: ConstantPoolIndex: [ 0.00 0.00 ]
+Key: TargetIndex: [ 0.00 0.00 ]
+Key: JumpTableIndex: [ 0.00 0.00 ]
+Key: ExternalSymbol: [ 0.00 0.00 ]
+Key: GlobalAddress: [ 0.40 0.40 ]
+Key: BlockAddress: [ 0.00 0.00 ]
+Key: RegisterMask: [ 0.00 0.00 ]
+Key: RegisterLiveOut: [ 0.00 0.00 ]
+Key: Metadata: [ 0.00 0.00 ]
+Key: MCSymbol: [ 0.00 0.00 ]
+Key: CFIIndex: [ 0.00 0.00 ]
+Key: IntrinsicID: [ 0.00 0.00 ]
+Key: Predicate: [ 0.00 0.00 ]
+Key: ShuffleMask: [ 0.00 0.00 ]
+Key: PhyReg_GR8: [ 0.00 0.00 ]
+Key: PhyReg_GRH8: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: PhyReg_GRH16: [ 0.00 0.00 ]
+Key: PhyReg_GR16: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK1: [ 0.00 0.00 ]
+Key: PhyReg_VK16: [ 0.00 0.00 ]
+Key: PhyReg_VK2: [ 0.00 0.00 ]
+Key: PhyReg_VK4: [ 0.00 0.00 ]
+Key: PhyReg_VK8: [ 0.00 0.00 ]
+Key: PhyReg_VK16WM: [ 0.00 0.00 ]
+Key: PhyReg_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_VK2WM: [ 0.00 0.00 ]
+Key: PhyReg_VK4WM: [ 0.00 0.00 ]
+Key: PhyReg_VK8WM: [ 0.00 0.00 ]
+Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_FPCCR: [ 0.00 0.00 ]
+Key: PhyReg_FR16X: [ 0.00 0.00 ]
+Key: PhyReg_FR16: [ 0.00 0.00 ]
+Key: PhyReg_VK16PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK2PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK4PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK8PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_FR32X: [ 0.00 0.00 ]
+Key: PhyReg_GR32: [ 0.50 0.50 ]
+Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_RFP32: [ 0.00 0.00 ]
+Key: PhyReg_VK32WM: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_CCR: [ 0.00 0.00 ]
+Key: PhyReg_DFCCR: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_RFP64: [ 0.00 0.00 ]
+Key: PhyReg_GR64: [ 0.60 0.60 ]
+Key: PhyReg_FR64X: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK64: [ 0.00 0.00 ]
+Key: PhyReg_VR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_VK64WM: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_A: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_RST: [ 0.00 0.00 ]
+Key: PhyReg_RFP80: [ 0.00 0.00 ]
+Key: PhyReg_RFP80_7: [ 0.00 0.00 ]
+Key: PhyReg_VR128X: [ 0.00 0.00 ]
+Key: PhyReg_VR128: [ 0.00 0.00 ]
+Key: PhyReg_VR256X: [ 0.00 0.00 ]
+Key: PhyReg_VR256: [ 0.00 0.00 ]
+Key: PhyReg_VR512: [ 0.00 0.00 ]
+Key: PhyReg_VR512_0_15: [ 0.00 0.00 ]
+Key: PhyReg_TILE: [ 0.00 0.00 ]
+Key: PhyReg_TILEPAIR: [ 0.00 0.00 ]
+Key: VirtReg_GR8: [ 0.00 0.00 ]
+Key: VirtReg_GRH8: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: VirtReg_GRH16: [ 0.00 0.00 ]
+Key: VirtReg_GR16: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK1: [ 0.00 0.00 ]
+Key: VirtReg_VK16: [ 0.00 0.00 ]
+Key: VirtReg_VK2: [ 0.00 0.00 ]
+Key: VirtReg_VK4: [ 0.00 0.00 ]
+Key: VirtReg_VK8: [ 0.00 0.00 ]
+Key: VirtReg_VK16WM: [ 0.00 0.00 ]
+Key: VirtReg_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_VK2WM: [ 0.00 0.00 ]
+Key: VirtReg_VK4WM: [ 0.00 0.00 ]
+Key: VirtReg_VK8WM: [ 0.00 0.00 ]
+Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_FPCCR: [ 0.00 0.00 ]
+Key: VirtReg_FR16X: [ 0.00 0.00 ]
+Key: VirtReg_FR16: [ 0.00 0.00 ]
+Key: VirtReg_VK16PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK2PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK4PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK8PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_FR32X: [ 0.00 0.00 ]
+Key: VirtReg_GR32: [ 0.80 0.80 ]
+Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_RFP32: [ 0.00 0.00 ]
+Key: VirtReg_VK32WM: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_CCR: [ 0.00 0.00 ]
+Key: VirtReg_DFCCR: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_RFP64: [ 0.00 0.00 ]
+Key: VirtReg_GR64: [ 0.90 0.90 ]
+Key: VirtReg_FR64X: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK64: [ 0.00 0.00 ]
+Key: VirtReg_VR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_VK64WM: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_A: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_RST: [ 0.00 0.00 ]
+Key: VirtReg_RFP80: [ 0.00 0.00 ]
+Key: VirtReg_RFP80_7: [ 0.00 0.00 ]
+Key: VirtReg_VR128X: [ 0.00 0.00 ]
+Key: VirtReg_VR128: [ 0.00 0.00 ]
+Key: VirtReg_VR256X: [ 0.00 0.00 ]
+Key: VirtReg_VR256: [ 0.00 0.00 ]
+Key: VirtReg_VR512: [ 0.00 0.00 ]
+Key: VirtReg_VR512_0_15: [ 0.00 0.00 ]
+Key: VirtReg_TILE: [ 0.00 0.00 ]
+Key: VirtReg_TILEPAIR: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/if-else.mir b/llvm/test/CodeGen/MIR2Vec/if-else.mir
index 5734a23..f2572f5 100644
--- a/llvm/test/CodeGen/MIR2Vec/if-else.mir
+++ b/llvm/test/CodeGen/MIR2Vec/if-else.mir
@@ -135,10 +135,10 @@ body: |
# CHECK: Machine basic block vectors:
# CHECK-NEXT: Machine basic block: abc:entry:
-# CHECK-NEXT: [ 16.50 17.10 17.70 ]
+# CHECK-NEXT: [ 23.60 24.20 24.80 ]
# CHECK-NEXT: Machine basic block: abc:if.then:
-# CHECK-NEXT: [ 4.50 4.80 5.10 ]
+# CHECK-NEXT: [ 7.30 7.60 7.90 ]
# CHECK-NEXT: Machine basic block: abc:if.else:
-# CHECK-NEXT: [ 0.80 1.00 1.20 ]
+# CHECK-NEXT: [ 3.40 3.60 3.80 ]
# CHECK-NEXT: Machine basic block: abc:return:
-# CHECK-NEXT: [ 6.60 6.90 7.20 ]
\ No newline at end of file
+# CHECK-NEXT: [ 8.80 9.10 9.40 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
index 338cb63..0fdcc81 100644
--- a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
+++ b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
@@ -48,29 +48,29 @@ body: |
RET 0
# CHECK: MIR2Vec embeddings for machine function add_function:
-# CHECK: Function vector: [ 19.20 19.80 20.40 ]
+# CHECK: Function vector: [ 26.50 27.10 27.70 ]
# CHECK-NEXT: Machine basic block vectors:
# CHECK-NEXT: Machine basic block: add_function:entry:
-# CHECK-NEXT: [ 19.20 19.80 20.40 ]
+# CHECK-NEXT: [ 26.50 27.10 27.70 ]
# CHECK-NEXT: Machine instruction vectors:
# CHECK-NEXT: Machine instruction: %1:gr32 = COPY $esi
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
# CHECK-NEXT: Machine instruction: %0:gr32 = COPY $edi
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
# CHECK-NEXT: Machine instruction: %2:gr32 = nsw ADD32rr %0:gr32(tied-def 0), %1:gr32, implicit-def dead $eflags
-# CHECK-NEXT: [ 1.30 1.40 1.50 ]
+# CHECK-NEXT: [ 3.70 3.80 3.90 ]
# CHECK-NEXT: Machine instruction: %3:gr32 = ADD32rr %2:gr32(tied-def 0), %2:gr32, implicit-def dead $eflags
-# CHECK-NEXT: [ 1.30 1.40 1.50 ]
+# CHECK-NEXT: [ 3.70 3.80 3.90 ]
# CHECK-NEXT: Machine instruction: $eax = COPY %3:gr32
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
# CHECK-NEXT: Machine instruction: RET 0, $eax
-# CHECK-NEXT: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
# CHECK: MIR2Vec embeddings for machine function simple_function:
-# CHECK-NEXT:Function vector: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT: Function vector: [ 1.10 1.20 1.30 ]
# CHECK-NEXT: Machine basic block vectors:
# CHECK-NEXT: Machine basic block: simple_function:entry:
-# CHECK-NEXT: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
# CHECK-NEXT: Machine instruction vectors:
# CHECK-NEXT: Machine instruction: RET 0
-# CHECK-NEXT: [ 1.00 1.10 1.20 ]
\ No newline at end of file
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
index c6554bc..13e908e 100644
--- a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
+++ b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
@@ -10,6 +10,6 @@ define dso_local void @test() {
}
; CHECK-INVALID: MIR2Vec Vocabulary Printer: Failed to get vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path
-; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'entities' section of the vocabulary is zero
-; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'entities' section in vocabulary file
-; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'entities' section of the vocabulary are not of the same dimension
+; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'Opcodes' section of the vocabulary is zero
+; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'Opcodes' section in vocabulary file
+; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'Opcodes' section of the vocabulary are not of the same dimension
diff --git a/llvm/test/CodeGen/MSP430/libcalls.ll b/llvm/test/CodeGen/MSP430/libcalls.ll
index 5d3755c..d1bafea 100644
--- a/llvm/test/CodeGen/MSP430/libcalls.ll
+++ b/llvm/test/CodeGen/MSP430/libcalls.ll
@@ -639,4 +639,18 @@ entry:
ret i32 %shr
}
+define i64 @test__mspabi_divull(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: test__mspabi_divull:
+; CHECK: call #__mspabi_divull
+ %result = udiv i64 %a, %b
+ ret i64 %result
+}
+
+define i64 @test__mspabi_remull(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: test__mspabi_remull:
+; CHECK: call #__mspabi_remull
+ %result = urem i64 %a, %b
+ ret i64 %result
+}
+
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/store-fp-zero-to-x0.ll b/llvm/test/CodeGen/RISCV/GlobalISel/store-fp-zero-to-x0.ll
new file mode 100644
index 0000000..bc79c6f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/store-fp-zero-to-x0.ll
@@ -0,0 +1,320 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=riscv32 -mattr=+f,+zfh < %s \
+; RUN: | FileCheck %s --check-prefix=RV32F
+; RUN: llc -global-isel -mtriple=riscv32 -mattr=+d,+zfh < %s \
+; RUN: | FileCheck %s --check-prefix=RV32D
+; RUN: llc -global-isel -mtriple=riscv64 -mattr=+f,+zfh < %s \
+; RUN: | FileCheck %s --check-prefix=RV64F
+; RUN: llc -global-isel -mtriple=riscv64 -mattr=+d,+zfh < %s \
+; RUN: | FileCheck %s --check-prefix=RV64D
+
+define void @zero_f16(ptr %i) {
+; RV32F-LABEL: zero_f16:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sh zero, 0(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_f16:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sh zero, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_f16:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sh zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_f16:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sh zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store half 0.0, ptr %i, align 4
+ ret void
+}
+
+define void @zero_bf16(ptr %i) {
+; RV32F-LABEL: zero_bf16:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sh zero, 0(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_bf16:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sh zero, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_bf16:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sh zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_bf16:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sh zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store bfloat 0.0, ptr %i, align 4
+ ret void
+}
+
+define void @zero_f32(ptr %i) {
+; RV32F-LABEL: zero_f32:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sw zero, 0(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_f32:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sw zero, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_f32:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sw zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_f32:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sw zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store float 0.0, ptr %i, align 4
+ ret void
+}
+
+
+define void @zero_f64(ptr %i) {
+; RV32F-LABEL: zero_f64:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: lui a1, %hi(.LCPI3_0)
+; RV32F-NEXT: addi a1, a1, %lo(.LCPI3_0)
+; RV32F-NEXT: lw a2, 0(a1)
+; RV32F-NEXT: lw a1, 4(a1)
+; RV32F-NEXT: sw a2, 0(a0)
+; RV32F-NEXT: sw a1, 4(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_f64:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: fcvt.d.w fa5, zero
+; RV32D-NEXT: fsd fa5, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_f64:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sd zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_f64:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sd zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store double 0.0, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v1f32(ptr %i) {
+; RV32F-LABEL: zero_v1f32:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sw zero, 0(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v1f32:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sw zero, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v1f32:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sw zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v1f32:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sw zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store <1 x float> <float 0.0>, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v2f32(ptr %i) {
+; RV32F-LABEL: zero_v2f32:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sw zero, 0(a0)
+; RV32F-NEXT: sw zero, 4(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v2f32:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sw zero, 0(a0)
+; RV32D-NEXT: sw zero, 4(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v2f32:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sw zero, 0(a0)
+; RV64F-NEXT: sw zero, 4(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v2f32:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sw zero, 0(a0)
+; RV64D-NEXT: sw zero, 4(a0)
+; RV64D-NEXT: ret
+entry:
+ store <2 x float> <float 0.0, float 0.0>, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v4f32(ptr %i) {
+; RV32F-LABEL: zero_v4f32:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sw zero, 0(a0)
+; RV32F-NEXT: sw zero, 4(a0)
+; RV32F-NEXT: sw zero, 8(a0)
+; RV32F-NEXT: sw zero, 12(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v4f32:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sw zero, 0(a0)
+; RV32D-NEXT: sw zero, 4(a0)
+; RV32D-NEXT: sw zero, 8(a0)
+; RV32D-NEXT: sw zero, 12(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v4f32:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sw zero, 0(a0)
+; RV64F-NEXT: sw zero, 4(a0)
+; RV64F-NEXT: sw zero, 8(a0)
+; RV64F-NEXT: sw zero, 12(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v4f32:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sw zero, 0(a0)
+; RV64D-NEXT: sw zero, 4(a0)
+; RV64D-NEXT: sw zero, 8(a0)
+; RV64D-NEXT: sw zero, 12(a0)
+; RV64D-NEXT: ret
+entry:
+ store <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v1f64(ptr %i) {
+; RV32F-LABEL: zero_v1f64:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: lui a1, %hi(.LCPI7_0)
+; RV32F-NEXT: addi a1, a1, %lo(.LCPI7_0)
+; RV32F-NEXT: lw a2, 0(a1)
+; RV32F-NEXT: lw a1, 4(a1)
+; RV32F-NEXT: sw a2, 0(a0)
+; RV32F-NEXT: sw a1, 4(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v1f64:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: fcvt.d.w fa5, zero
+; RV32D-NEXT: fsd fa5, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v1f64:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sd zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v1f64:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sd zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store <1 x double> <double 0.0>, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v2f64(ptr %i) {
+; RV32F-LABEL: zero_v2f64:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: lui a1, %hi(.LCPI8_0)
+; RV32F-NEXT: addi a1, a1, %lo(.LCPI8_0)
+; RV32F-NEXT: lw a2, 0(a1)
+; RV32F-NEXT: lw a1, 4(a1)
+; RV32F-NEXT: sw a2, 0(a0)
+; RV32F-NEXT: sw a1, 4(a0)
+; RV32F-NEXT: sw a2, 8(a0)
+; RV32F-NEXT: sw a1, 12(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v2f64:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: fcvt.d.w fa5, zero
+; RV32D-NEXT: fsd fa5, 0(a0)
+; RV32D-NEXT: fsd fa5, 8(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v2f64:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sd zero, 0(a0)
+; RV64F-NEXT: sd zero, 8(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v2f64:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sd zero, 0(a0)
+; RV64D-NEXT: sd zero, 8(a0)
+; RV64D-NEXT: ret
+entry:
+ store <2 x double> <double 0.0, double 0.0>, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v4f64(ptr %i) {
+; RV32F-LABEL: zero_v4f64:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: lui a1, %hi(.LCPI9_0)
+; RV32F-NEXT: addi a1, a1, %lo(.LCPI9_0)
+; RV32F-NEXT: lw a2, 0(a1)
+; RV32F-NEXT: lw a1, 4(a1)
+; RV32F-NEXT: sw a2, 0(a0)
+; RV32F-NEXT: sw a1, 4(a0)
+; RV32F-NEXT: sw a2, 8(a0)
+; RV32F-NEXT: sw a1, 12(a0)
+; RV32F-NEXT: sw a2, 16(a0)
+; RV32F-NEXT: sw a1, 20(a0)
+; RV32F-NEXT: sw a2, 24(a0)
+; RV32F-NEXT: sw a1, 28(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v4f64:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: fcvt.d.w fa5, zero
+; RV32D-NEXT: fsd fa5, 0(a0)
+; RV32D-NEXT: fsd fa5, 8(a0)
+; RV32D-NEXT: fsd fa5, 16(a0)
+; RV32D-NEXT: fsd fa5, 24(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v4f64:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sd zero, 0(a0)
+; RV64F-NEXT: sd zero, 8(a0)
+; RV64F-NEXT: sd zero, 16(a0)
+; RV64F-NEXT: sd zero, 24(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v4f64:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sd zero, 0(a0)
+; RV64D-NEXT: sd zero, 8(a0)
+; RV64D-NEXT: sd zero, 16(a0)
+; RV64D-NEXT: sd zero, 24(a0)
+; RV64D-NEXT: ret
+entry:
+ store <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, ptr %i, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rv32p.ll b/llvm/test/CodeGen/RISCV/rv32p.ll
new file mode 100644
index 0000000..4eee880a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv32p.ll
@@ -0,0 +1,709 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
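+; Basic codegen tests for the experimental P extension on RV32: counting
+; leading/trailing zeros, sign extension, min/max, abs, bswap, and shift
+; patterns.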
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+define i32 @ctlz_i32(i32 %a) nounwind {
+; CHECK-LABEL: ctlz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i64 @ctlz_i64(i64 %a) nounwind {
+; CHECK-LABEL: ctlz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bnez a1, .LBB1_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: addi a0, a0, 32
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: clz a0, a1
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+declare i32 @llvm.cttz.i32(i32, i1)
+
+define i32 @cttz_i32(i32 %a) nounwind {
+; CHECK-LABEL: cttz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB2_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+declare i64 @llvm.cttz.i64(i64, i1)
+
+define i64 @cttz_i64(i64 %a) nounwind {
+; CHECK-LABEL: cttz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: or a2, a0, a1
+; CHECK-NEXT: beqz a2, .LBB3_3
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: bnez a0, .LBB3_4
+; CHECK-NEXT: # %bb.2: # %cond.false
+; CHECK-NEXT: addi a0, a1, -1
+; CHECK-NEXT: not a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: j .LBB3_5
+; CHECK-NEXT: .LBB3_3:
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB3_4:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: .LBB3_5: # %cond.false
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+define i32 @sextb_i32(i32 %a) nounwind {
+; CHECK-LABEL: sextb_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 24
+ %shr = ashr exact i32 %shl, 24
+ ret i32 %shr
+}
+
+define i64 @sextb_i64(i64 %a) nounwind {
+; CHECK-LABEL: sextb_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: srai a1, a0, 31
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 56
+ %shr = ashr exact i64 %shl, 56
+ ret i64 %shr
+}
+
+define i32 @sexth_i32(i32 %a) nounwind {
+; CHECK-LABEL: sexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 16
+ %shr = ashr exact i32 %shl, 16
+ ret i32 %shr
+}
+
+define i64 @sexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: sexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: srai a1, a0, 31
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 48
+ %shr = ashr exact i64 %shl, 48
+ ret i64 %shr
+}
+
+define i32 @min_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: min_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Since we do not directly match i64 code patterns on RV32, some i64 patterns
+; do not yet have matching bit-manipulation instructions on RV32. This test is
+; included in case future expansions of the Bitmanip extensions introduce
+; instructions suitable for this pattern.
+
+define i64 @min_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: min_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB9_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: slt a4, a1, a3
+; CHECK-NEXT: beqz a4, .LBB9_3
+; CHECK-NEXT: j .LBB9_4
+; CHECK-NEXT: .LBB9_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: bnez a4, .LBB9_4
+; CHECK-NEXT: .LBB9_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB9_4:
+; CHECK-NEXT: ret
+ %cmp = icmp slt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @max_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: max_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Since we do not directly match i64 code patterns on RV32, some i64 patterns
+; do not yet have matching bit-manipulation instructions on RV32. This test is
+; included in case future expansions of the Bitmanip extensions introduce
+; instructions suitable for this pattern.
+
+define i64 @max_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: max_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: slt a4, a3, a1
+; CHECK-NEXT: beqz a4, .LBB11_3
+; CHECK-NEXT: j .LBB11_4
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: bnez a4, .LBB11_4
+; CHECK-NEXT: .LBB11_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB11_4:
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @minu_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: minu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Since we do not directly match i64 code patterns on RV32, some i64 patterns
+; do not yet have matching bit-manipulation instructions on RV32. This test is
+; included in case future expansions of the Bitmanip extensions introduce
+; instructions suitable for this pattern.
+
+define i64 @minu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: minu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB13_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: beqz a4, .LBB13_3
+; CHECK-NEXT: j .LBB13_4
+; CHECK-NEXT: .LBB13_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: bnez a4, .LBB13_4
+; CHECK-NEXT: .LBB13_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB13_4:
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @maxu_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: maxu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Since we do not directly match i64 code patterns on RV32, some i64 patterns
+; do not yet have matching bit-manipulation instructions on RV32. This test is
+; included in case future expansions of the Bitmanip extensions introduce
+; instructions suitable for this pattern.
+
+define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: maxu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB15_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a3, a1
+; CHECK-NEXT: beqz a4, .LBB15_3
+; CHECK-NEXT: j .LBB15_4
+; CHECK-NEXT: .LBB15_2:
+; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: bnez a4, .LBB15_4
+; CHECK-NEXT: .LBB15_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB15_4:
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+
+define i32 @abs_i32(i32 %x) {
+; CHECK-LABEL: abs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+declare i64 @llvm.abs.i64(i64, i1 immarg)
+
+define i64 @abs_i64(i64 %x) {
+; CHECK-LABEL: abs_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bgez a1, .LBB17_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: snez a2, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: sub a1, a1, a2
+; CHECK-NEXT: .LBB17_2:
+; CHECK-NEXT: ret
+ %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
+ ret i64 %abs
+}
+
+define i32 @zexth_i32(i32 %a) nounwind {
+; CHECK-LABEL: zexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srli a0, a0, 16
+; CHECK-NEXT: ret
+ %and = and i32 %a, 65535
+ ret i32 %and
+}
+
+define i64 @zexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: zexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srli a0, a0, 16
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %and = and i64 %a, 65535
+ ret i64 %and
+}
+
+declare i32 @llvm.bswap.i32(i32)
+
+define i32 @bswap_i32(i32 %a) nounwind {
+; CHECK-LABEL: bswap_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ ret i32 %1
+}
+
+declare i64 @llvm.bswap.i64(i64)
+
+define i64 @bswap_i64(i64 %a) {
+; CHECK-LABEL: bswap_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a2, a1
+; CHECK-NEXT: rev8 a1, a0
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.bswap.i64(i64 %a)
+ ret i64 %1
+}
+
+define i32 @srai_slli(i16 signext %0) {
+; CHECK-LABEL: srai_slli:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 25
+; CHECK-NEXT: srai a0, a0, 31
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 15
+ %3 = sext i16 %sext to i32
+ ret i32 %3
+}
+
+define i32 @srai_slli2(i16 signext %0) {
+; CHECK-LABEL: srai_slli2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 25
+; CHECK-NEXT: srai a0, a0, 30
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 14
+ %3 = sext i16 %sext to i32
+ ret i32 %3
+}
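+
+; The following tests check that "subtract y from x only if x >= y" patterns
+; (equivalently, umin(x, x - y) for unsigned x and y) are lowered to a sub
+; followed by minu.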
+define i8 @sub_if_uge_i8(i8 %x, i8 %y) {
+; CHECK-LABEL: sub_if_uge_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: zext.b a2, a0
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: zext.b a0, a0
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i8 %x, %y
+ %select = select i1 %cmp, i8 0, i8 %y
+ %sub = sub nuw i8 %x, %select
+ ret i8 %sub
+}
+
+define i16 @sub_if_uge_i16(i16 %x, i16 %y) {
+; CHECK-LABEL: sub_if_uge_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i16 %x, %y
+ %select = select i1 %cmp, i16 0, i16 %y
+ %sub = sub nuw i16 %x, %select
+ ret i16 %sub
+}
+
+define i32 @sub_if_uge_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: sub_if_uge_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB27_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: j .LBB27_3
+; CHECK-NEXT: .LBB27_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: .LBB27_3:
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sub a1, a1, a3
+; CHECK-NEXT: sub a1, a1, a4
+; CHECK-NEXT: sub a0, a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %x, %y
+ %select = select i1 %cmp, i64 0, i64 %y
+ %sub = sub nuw i64 %x, %select
+ ret i64 %sub
+}
+
+define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: sub_if_uge_i128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a3, 4(a1)
+; CHECK-NEXT: lw a4, 8(a1)
+; CHECK-NEXT: lw a5, 12(a1)
+; CHECK-NEXT: lw a6, 4(a2)
+; CHECK-NEXT: lw t0, 12(a2)
+; CHECK-NEXT: lw a7, 8(a2)
+; CHECK-NEXT: beq a5, t0, .LBB28_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu t1, a5, t0
+; CHECK-NEXT: j .LBB28_3
+; CHECK-NEXT: .LBB28_2:
+; CHECK-NEXT: sltu t1, a4, a7
+; CHECK-NEXT: .LBB28_3:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: beq a3, a6, .LBB28_5
+; CHECK-NEXT: # %bb.4:
+; CHECK-NEXT: sltu t2, a3, a6
+; CHECK-NEXT: j .LBB28_6
+; CHECK-NEXT: .LBB28_5:
+; CHECK-NEXT: sltu t2, a1, a2
+; CHECK-NEXT: .LBB28_6:
+; CHECK-NEXT: xor t3, a5, t0
+; CHECK-NEXT: xor t4, a4, a7
+; CHECK-NEXT: or t3, t4, t3
+; CHECK-NEXT: beqz t3, .LBB28_8
+; CHECK-NEXT: # %bb.7:
+; CHECK-NEXT: mv t2, t1
+; CHECK-NEXT: .LBB28_8:
+; CHECK-NEXT: addi t3, t2, -1
+; CHECK-NEXT: and t2, t3, t0
+; CHECK-NEXT: and t0, t3, a2
+; CHECK-NEXT: and t1, t3, a6
+; CHECK-NEXT: sltu a2, a1, t0
+; CHECK-NEXT: and a7, t3, a7
+; CHECK-NEXT: mv a6, a2
+; CHECK-NEXT: beq a3, t1, .LBB28_10
+; CHECK-NEXT: # %bb.9:
+; CHECK-NEXT: sltu a6, a3, t1
+; CHECK-NEXT: .LBB28_10:
+; CHECK-NEXT: sub t3, a4, a7
+; CHECK-NEXT: sltu a4, a4, a7
+; CHECK-NEXT: sub a5, a5, t2
+; CHECK-NEXT: sub a3, a3, t1
+; CHECK-NEXT: sub a1, a1, t0
+; CHECK-NEXT: sltu a7, t3, a6
+; CHECK-NEXT: sub a5, a5, a4
+; CHECK-NEXT: sub a4, t3, a6
+; CHECK-NEXT: sub a3, a3, a2
+; CHECK-NEXT: sub a2, a5, a7
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: sw a3, 4(a0)
+; CHECK-NEXT: sw a4, 8(a0)
+; CHECK-NEXT: sw a2, 12(a0)
+; CHECK-NEXT: ret
+ %cmp = icmp ult i128 %x, %y
+ %select = select i1 %cmp, i128 0, i128 %y
+ %sub = sub nuw i128 %x, %select
+ ret i128 %sub
+}
+
+define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_select_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a2, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: sll a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %shl = shl i32 %sub, %select
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: minu a2, a0, a2
+; CHECK-NEXT: bltu a0, a1, .LBB30_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 4
+; CHECK-NEXT: sll a0, a2, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB30_2:
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: sll a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %select2 = select i1 %cmp, i32 2, i32 4
+ %shl = shl i32 %sub, %select2
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_store_i32(i32 %x, i32 %y, ptr %z) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_store_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: xori a3, a3, 1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: sw a3, 0(a2)
+; CHECK-NEXT: ret
+ %cmp = icmp uge i32 %x, %y
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %select = select i1 %cmp, i32 %y, i32 0
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i8 @sub_if_uge_C_i8(i8 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -13
+; CHECK-NEXT: zext.b a1, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i8 %x, 12
+ %sub = add i8 %x, -13
+ %conv4 = select i1 %cmp, i8 %sub, i8 %x
+ ret i8 %conv4
+}
+
+define i16 @sub_if_uge_C_i16(i16 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -251
+; CHECK-NEXT: slli a1, a1, 16
+; CHECK-NEXT: srli a1, a1, 16
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i16 %x, 250
+ %sub = add i16 %x, -251
+ %conv4 = select i1 %cmp, i16 %sub, i16 %x
+ ret i16 %conv4
+}
+
+define i32 @sub_if_uge_C_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i64 @sub_if_uge_C_i64(i64 %x) {
+; CHECK-LABEL: sub_if_uge_C_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 1
+; CHECK-NEXT: beq a1, a2, .LBB35_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltiu a2, a1, 2
+; CHECK-NEXT: xori a2, a2, 1
+; CHECK-NEXT: j .LBB35_3
+; CHECK-NEXT: .LBB35_2:
+; CHECK-NEXT: lui a2, 172127
+; CHECK-NEXT: addi a2, a2, 511
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: .LBB35_3:
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: andi a3, a2, -2
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: lui a3, 876449
+; CHECK-NEXT: addi a3, a3, -512
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: sltu a0, a2, a0
+; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %x, 4999999999
+ %sub = add i64 %x, -5000000000
+ %cond = select i1 %cmp, i64 %sub, i64 %x
+ ret i64 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_cmp_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: lui a3, 1048560
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: addi a3, a3, 15
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_sub_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_sub_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 1048560
+; CHECK-NEXT: addi a2, a2, 15
+; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %sub = add i32 %x, -65521
+ store i32 %sub, ptr %z, align 4
+ %cmp = icmp ugt i32 %x, 65520
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_swapped_i32(i32 %x) {
+; CHECK-LABEL: sub_if_uge_C_swapped_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, 65521
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ ret i32 %cond
+}
+
+define i7 @sub_if_uge_C_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ugt i7 %x, -18
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %add, i7 %x
+ ret i7 %s
+}
+
+define i7 @sub_if_uge_C_swapped_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_swapped_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ult i7 %x, -17
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %x, i7 %add
+ ret i7 %s
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64p.ll b/llvm/test/CodeGen/RISCV/rv64p.ll
new file mode 100644
index 0000000..cb07f94
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64p.ll
@@ -0,0 +1,677 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
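+; Basic codegen tests for the experimental P extension on RV64: counting
+; leading/trailing zeros, sign extension, min/max, abs, and bswap.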
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+define signext i32 @ctlz_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: ctlz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+define signext i32 @log2_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: log2_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 31
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ %2 = sub i32 31, %1
+ ret i32 %2
+}
+
+define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: log2_ceil_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = sub i32 %a, 1
+ %2 = call i32 @llvm.ctlz.i32(i32 %1, i1 false)
+ %3 = sub i32 32, %2
+ ret i32 %3
+}
+
+define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: findLastSet_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a1, a0
+; CHECK-NEXT: snez a0, a0
+; CHECK-NEXT: xori a1, a1, 31
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+ %2 = xor i32 31, %1
+ %3 = icmp eq i32 %a, 0
+ %4 = select i1 %3, i32 -1, i32 %2
+ ret i32 %4
+}
+
+define i32 @ctlz_lshr_i32(i32 signext %a) {
+; CHECK-LABEL: ctlz_lshr_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: srliw a0, a0, 1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: ret
+ %1 = lshr i32 %a, 1
+ %2 = call i32 @llvm.ctlz.i32(i32 %1, i1 false)
+ ret i32 %2
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i64 @ctlz_i64(i64 %a) nounwind {
+; CHECK-LABEL: ctlz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+declare i32 @llvm.cttz.i32(i32, i1)
+
+define signext i32 @cttz_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: cttz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB6_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: cttz_zero_undef_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ ret i32 %1
+}
+
+define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: findFirstSet_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a2, a0
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: snez a0, a0
+; CHECK-NEXT: clzw a1, a1
+; CHECK-NEXT: sub a2, a2, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: or a0, a0, a2
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ %2 = icmp eq i32 %a, 0
+ %3 = select i1 %2, i32 -1, i32 %1
+ ret i32 %3
+}
+
+define signext i32 @ffs_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: ffs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a2, a0
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: li a2, 33
+; CHECK-NEXT: seqz a0, a0
+; CHECK-NEXT: clzw a1, a1
+; CHECK-NEXT: sub a2, a2, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ %2 = add i32 %1, 1
+ %3 = icmp eq i32 %a, 0
+ %4 = select i1 %3, i32 0, i32 %2
+ ret i32 %4
+}
+
+declare i64 @llvm.cttz.i64(i64, i1)
+
+define i64 @cttz_i64(i64 %a) nounwind {
+; CHECK-LABEL: cttz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB10_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+define signext i32 @sextb_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: sextb_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 24
+ %shr = ashr exact i32 %shl, 24
+ ret i32 %shr
+}
+
+define i64 @sextb_i64(i64 %a) nounwind {
+; CHECK-LABEL: sextb_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 56
+ %shr = ashr exact i64 %shl, 56
+ ret i64 %shr
+}
+
+define signext i32 @sexth_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: sexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 16
+ %shr = ashr exact i32 %shl, 16
+ ret i32 %shr
+}
+
+define i64 @sexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: sexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 48
+ %shr = ashr exact i64 %shl, 48
+ ret i64 %shr
+}
+
+define signext i32 @min_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: min_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @min_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: min_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @max_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: max_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @max_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: max_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: minu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @minu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: minu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: maxu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: maxu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+
+define i32 @abs_i32(i32 %x) {
+; CHECK-LABEL: abs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+define signext i32 @abs_i32_sext(i32 signext %x) {
+; CHECK-LABEL: abs_i32_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+declare i64 @llvm.abs.i64(i64, i1 immarg)
+
+define i64 @abs_i64(i64 %x) {
+; CHECK-LABEL: abs_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
+ ret i64 %abs
+}
+
+declare i32 @llvm.bswap.i32(i32)
+
+define signext i32 @bswap_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: bswap_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: srai a0, a0, 32
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ ret i32 %1
+}
+
+; Similar to bswap_i32 but the result is not sign extended.
+define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind {
+; CHECK-LABEL: bswap_i32_nosext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: srli a0, a0, 32
+; CHECK-NEXT: sw a0, 0(a1)
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ store i32 %1, ptr %x
+ ret void
+}
+
+declare i64 @llvm.bswap.i64(i64)
+
+define i64 @bswap_i64(i64 %a) {
+; CHECK-LABEL: bswap_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.bswap.i64(i64 %a)
+ ret i64 %1
+}
+
+define i64 @srai_slli(i16 signext %0) {
+; CHECK-LABEL: srai_slli:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 57
+; CHECK-NEXT: srai a0, a0, 63
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 15
+ %3 = sext i16 %sext to i64
+ ret i64 %3
+}
+
+define i64 @srai_slli2(i16 signext %0) {
+; CHECK-LABEL: srai_slli2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 57
+; CHECK-NEXT: srai a0, a0, 62
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 14
+ %3 = sext i16 %sext to i64
+ ret i64 %3
+}
+
+define signext i32 @func0000000000000001(i32 signext %0, i8 signext %1) #0 {
+; CHECK-LABEL: func0000000000000001:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: slli a1, a1, 59
+; CHECK-NEXT: srai a1, a1, 63
+; CHECK-NEXT: addw a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %2 = shl i8 %1, 3
+ %3 = ashr i8 %2, 7
+ %4 = sext i8 %3 to i32
+ %5 = add nsw i32 %4, %0
+ ret i32 %5
+}
+
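+; The following tests check that "subtract y from x only if x >= y" patterns
+; (equivalently, umin(x, x - y) for unsigned x and y) are lowered to a sub
+; followed by minu.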
+define i8 @sub_if_uge_i8(i8 %x, i8 %y) {
+; CHECK-LABEL: sub_if_uge_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: zext.b a2, a0
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: zext.b a0, a0
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i8 %x, %y
+ %select = select i1 %cmp, i8 0, i8 %y
+ %sub = sub nuw i8 %x, %select
+ ret i8 %sub
+}
+
+define i16 @sub_if_uge_i16(i16 %x, i16 %y) {
+; CHECK-LABEL: sub_if_uge_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i16 %x, %y
+ %select = select i1 %cmp, i16 0, i16 %y
+ %sub = sub nuw i16 %x, %select
+ ret i16 %sub
+}
+
+define i32 @sub_if_uge_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a0
+; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: sub_if_uge_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %x, %y
+ %select = select i1 %cmp, i64 0, i64 %y
+ %sub = sub nuw i64 %x, %select
+ ret i64 %sub
+}
+
+define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: sub_if_uge_i128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB36_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: j .LBB36_3
+; CHECK-NEXT: .LBB36_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: .LBB36_3:
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sub a1, a1, a3
+; CHECK-NEXT: sub a1, a1, a4
+; CHECK-NEXT: sub a0, a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ult i128 %x, %y
+ %select = select i1 %cmp, i128 0, i128 %y
+ %sub = sub nuw i128 %x, %select
+ ret i128 %sub
+}
+
+define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_select_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a1
+; CHECK-NEXT: sext.w a3, a0
+; CHECK-NEXT: sltu a2, a3, a2
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %shl = shl i32 %sub, %select
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a1
+; CHECK-NEXT: sext.w a3, a0
+; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: bltu a3, a2, .LBB38_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB38_2:
+; CHECK-NEXT: li a1, 2
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %select2 = select i1 %cmp, i32 2, i32 4
+ %shl = shl i32 %sub, %select2
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_store_i32(i32 signext %x, i32 signext %y, ptr %z) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_store_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: subw a1, a0, a1
+; CHECK-NEXT: xori a3, a3, 1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: sw a3, 0(a2)
+; CHECK-NEXT: ret
+ %cmp = icmp uge i32 %x, %y
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %select = select i1 %cmp, i32 %y, i32 0
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i8 @sub_if_uge_C_i8(i8 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -13
+; CHECK-NEXT: zext.b a1, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i8 %x, 12
+ %sub = add i8 %x, -13
+ %conv4 = select i1 %cmp, i8 %sub, i8 %x
+ ret i8 %conv4
+}
+
+define i16 @sub_if_uge_C_i16(i16 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -251
+; CHECK-NEXT: slli a1, a1, 48
+; CHECK-NEXT: srli a1, a1, 48
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i16 %x, 250
+ %sub = add i16 %x, -251
+ %conv4 = select i1 %cmp, i16 %sub, i16 %x
+ ret i16 %conv4
+}
+
+define i32 @sub_if_uge_C_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: addw a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i64 @sub_if_uge_C_i64(i64 %x) {
+; CHECK-LABEL: sub_if_uge_C_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1046192
+; CHECK-NEXT: addi a1, a1, -761
+; CHECK-NEXT: slli a1, a1, 9
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %x, 4999999999
+ %sub = add i64 %x, -5000000000
+ %cond = select i1 %cmp, i64 %sub, i64 %x
+ ret i64 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_cmp_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: lui a3, 1048560
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: addi a3, a3, 15
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: addw a3, a0, a3
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_sub_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_sub_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 1048560
+; CHECK-NEXT: addi a2, a2, 15
+; CHECK-NEXT: addw a2, a0, a2
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %sub = add i32 %x, -65521
+ store i32 %sub, ptr %z, align 4
+ %cmp = icmp ugt i32 %x, 65520
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_swapped_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_swapped_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: addw a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, 65521
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ ret i32 %cond
+}
+
+define i7 @sub_if_uge_C_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ugt i7 %x, -18
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %add, i7 %x
+ ret i7 %s
+}
+
+define i7 @sub_if_uge_C_swapped_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_swapped_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ult i7 %x, -17
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %x, i7 %add
+ ret i7 %s
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll
new file mode 100644
index 0000000..5c0c6c1
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfbfmin,+xsfvfbfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfbfmin,+xsfvfbfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
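+; Tests for the SiFive sf.vfexp.v instruction on scalable bf16 vectors
+; (XSfvfbfexp16e), in both unmasked and masked forms.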
+define <vscale x 1 x bfloat> @intrinsic_sf_vfexp_v_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @intrinsic_sf_vfexp_v_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @intrinsic_sf_vfexp_v_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @intrinsic_sf_vfexp_v_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @intrinsic_sf_vfexp_v_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @intrinsic_sf_vfexp_v_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 32 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll
new file mode 100644
index 0000000..2d97f73
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+xsfvfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+xsfvfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
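+; Tests for the SiFive sf.vfexp.v instruction on scalable f16 vectors
+; (XSfvfexp16e), in both unmasked and masked forms.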
+define <vscale x 1 x half> @intrinsic_sf_vfexp_v_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x half> %a
+}
+
+define <vscale x 2 x half> @intrinsic_sf_vfexp_v_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16(
+ <vscale x 2 x half> poison,
+ <vscale x 2 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x half> %a
+}
+
+define <vscale x 4 x half> @intrinsic_sf_vfexp_v_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16(
+ <vscale x 4 x half> poison,
+ <vscale x 4 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x half> %a
+}
+
+define <vscale x 8 x half> @intrinsic_sf_vfexp_v_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16(
+ <vscale x 8 x half> poison,
+ <vscale x 8 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x half> %a
+}
+
+define <vscale x 16 x half> @intrinsic_sf_vfexp_v_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16(
+ <vscale x 16 x half> poison,
+ <vscale x 16 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x half> %a
+}
+
+define <vscale x 32 x half> @intrinsic_sf_vfexp_v_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16(
+ <vscale x 32 x half> poison,
+ <vscale x 32 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x half> %a
+}
+
+define <vscale x 1 x half> @intrinsic_sf_vfexp_mask_v_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16(
+ <vscale x 1 x half> %0,
+ <vscale x 1 x half> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x half> %a
+}
+
+define <vscale x 2 x half> @intrinsic_sf_vfexp_mask_v_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16(
+ <vscale x 2 x half> %0,
+ <vscale x 2 x half> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x half> %a
+}
+
+define <vscale x 4 x half> @intrinsic_sf_vfexp_mask_v_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16(
+ <vscale x 4 x half> %0,
+ <vscale x 4 x half> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x half> %a
+}
+
+define <vscale x 8 x half> @intrinsic_sf_vfexp_mask_v_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16(
+ <vscale x 8 x half> %0,
+ <vscale x 8 x half> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x half> %a
+}
+
+define <vscale x 16 x half> @intrinsic_sf_vfexp_mask_v_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16(
+ <vscale x 16 x half> %0,
+ <vscale x 16 x half> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x half> %a
+}
+
+define <vscale x 32 x half> @intrinsic_sf_vfexp_mask_v_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16(
+ <vscale x 32 x half> %0,
+ <vscale x 32 x half> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 32 x half> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll
new file mode 100644
index 0000000..46dce14
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+xsfvfexp32e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+xsfvfexp32e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
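+; Tests for the SiFive sf.vfexp.v instruction on scalable f32 vectors
+; (XSfvfexp32e), in both unmasked and masked forms.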
+define <vscale x 1 x float> @intrinsic_sf_vfexp_v_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_sf_vfexp_v_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_sf_vfexp_v_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_sf_vfexp_v_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_sf_vfexp_v_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x float> %a
+}
+
+define <vscale x 1 x float> @intrinsic_sf_vfexp_mask_v_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_sf_vfexp_mask_v_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_sf_vfexp_mask_v_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_sf_vfexp_mask_v_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_sf_vfexp_mask_v_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll
new file mode 100644
index 0000000..d3d10d2
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll
@@ -0,0 +1,335 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zve64f,+zvfh,+xsfvfexpa \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64f,+zvfh,+xsfvfexpa \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x float> @test_intrinsic_sf_vfexpa_v_nxv1f32(<vscale x 1 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ iXLen %1)
+ ret <vscale x 1 x float> %f
+}
+
+define <vscale x 2 x float> @test_intrinsic_sf_vfexpa_v_nxv2f32(<vscale x 2 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ iXLen %1)
+ ret <vscale x 2 x float> %f
+}
+
+define <vscale x 4 x float> @test_intrinsic_sf_vfexpa_v_nxv4f32(<vscale x 4 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ iXLen %1)
+ ret <vscale x 4 x float> %f
+}
+
+define <vscale x 8 x float> @test_intrinsic_sf_vfexpa_v_nxv8f32(<vscale x 8 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ iXLen %1)
+ ret <vscale x 8 x float> %f
+}
+
+define <vscale x 16 x float> @test_intrinsic_sf_vfexpa_v_nxv16f32(<vscale x 16 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ iXLen %1)
+ ret <vscale x 16 x float> %f
+}
+
+define <vscale x 1 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x float> %f
+}
+
+define <vscale x 2 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x float> %f
+}
+
+define <vscale x 4 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x float> %f
+}
+
+define <vscale x 8 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x float> %f
+}
+
+define <vscale x 16 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 16 x float> %f
+}
+
+define <vscale x 1 x half> @test_intrinsic_sf_vfexpa_v_nxv1f16(<vscale x 1 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %0,
+ iXLen %1)
+ ret <vscale x 1 x half> %f
+}
+
+define <vscale x 2 x half> @test_intrinsic_sf_vfexpa_v_nxv2f16(<vscale x 2 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16(
+ <vscale x 2 x half> poison,
+ <vscale x 2 x half> %0,
+ iXLen %1)
+ ret <vscale x 2 x half> %f
+}
+
+define <vscale x 4 x half> @test_intrinsic_sf_vfexpa_v_nxv4f16(<vscale x 4 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16(
+ <vscale x 4 x half> poison,
+ <vscale x 4 x half> %0,
+ iXLen %1)
+ ret <vscale x 4 x half> %f
+}
+
+define <vscale x 8 x half> @test_intrinsic_sf_vfexpa_v_nxv8f16(<vscale x 8 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16(
+ <vscale x 8 x half> poison,
+ <vscale x 8 x half> %0,
+ iXLen %1)
+ ret <vscale x 8 x half> %f
+}
+
+define <vscale x 16 x half> @test_intrinsic_sf_vfexpa_v_nxv16f16(<vscale x 16 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16(
+ <vscale x 16 x half> poison,
+ <vscale x 16 x half> %0,
+ iXLen %1)
+ ret <vscale x 16 x half> %f
+}
+
+define <vscale x 32 x half> @test_intrinsic_sf_vfexpa_v_nxv32f16(<vscale x 32 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16(
+ <vscale x 32 x half> poison,
+ <vscale x 32 x half> %0,
+ iXLen %1)
+ ret <vscale x 32 x half> %f
+}
+
+define <vscale x 1 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16(
+ <vscale x 1 x half> %0,
+ <vscale x 1 x half> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x half> %f
+}
+
+define <vscale x 2 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16(
+ <vscale x 2 x half> %0,
+ <vscale x 2 x half> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x half> %f
+}
+
+define <vscale x 4 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16(
+ <vscale x 4 x half> %0,
+ <vscale x 4 x half> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x half> %f
+}
+
+define <vscale x 8 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16(
+ <vscale x 8 x half> %0,
+ <vscale x 8 x half> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x half> %f
+}
+
+define <vscale x 16 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16(
+ <vscale x 16 x half> %0,
+ <vscale x 16 x half> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 16 x half> %f
+}
+
+define <vscale x 32 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16(
+ <vscale x 32 x half> %0,
+ <vscale x 32 x half> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 32 x half> %f
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll
new file mode 100644
index 0000000..3de0e93
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfvfexpa64e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfvfexpa64e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x double> @test_intrinsic_sf_vfexpa_v_nxv1f64(<vscale x 1 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64(
+ <vscale x 1 x double> poison,
+ <vscale x 1 x double> %0,
+ iXLen %1)
+ ret <vscale x 1 x double> %f
+}
+
+define <vscale x 2 x double> @test_intrinsic_sf_vfexpa_v_nxv2f64(<vscale x 2 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64(
+ <vscale x 2 x double> poison,
+ <vscale x 2 x double> %0,
+ iXLen %1)
+ ret <vscale x 2 x double> %f
+}
+
+define <vscale x 4 x double> @test_intrinsic_sf_vfexpa_v_nxv4f64(<vscale x 4 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64(
+ <vscale x 4 x double> poison,
+ <vscale x 4 x double> %0,
+ iXLen %1)
+ ret <vscale x 4 x double> %f
+}
+
+define <vscale x 8 x double> @test_intrinsic_sf_vfexpa_v_nxv8f64(<vscale x 8 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64(
+ <vscale x 8 x double> poison,
+ <vscale x 8 x double> %0,
+ iXLen %1)
+ ret <vscale x 8 x double> %f
+}
+
+define <vscale x 1 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64(
+ <vscale x 1 x double> %0,
+ <vscale x 1 x double> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x double> %f
+}
+
+define <vscale x 2 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64(
+ <vscale x 2 x double> %0,
+ <vscale x 2 x double> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x double> %f
+}
+
+define <vscale x 4 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64(
+ <vscale x 4 x double> %0,
+ <vscale x 4 x double> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x double> %f
+}
+
+define <vscale x 8 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64(
+ <vscale x 8 x double> %0,
+ <vscale x 8 x double> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x double> %f
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
index 061b2b0..abd00b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
@@ -11,33 +11,80 @@
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+zvfh,+experimental-zvfbfa,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 1 x bfloat> %va, %vb
ret <vscale x 1 x bfloat> %vc
}
define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vf v9, v9, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vf v9, v9, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v9, v9, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v9, v9, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
%vc = fadd <vscale x 1 x bfloat> %va, %splat
@@ -45,31 +92,75 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloa
}
define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 2 x bfloat> %va, %vb
ret <vscale x 2 x bfloat> %vc
}
define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vf v9, v9, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vf v9, v9, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v9, v9, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v9, v9, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
%vc = fadd <vscale x 2 x bfloat> %va, %splat
@@ -77,31 +168,75 @@ define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloa
}
define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vv v10, v12, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vv v10, v12, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v12, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 4 x bfloat> %va, %vb
ret <vscale x 4 x bfloat> %vc
}
define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vf v10, v10, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vf v10, v10, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v10, v10, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v10, v10, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
%vc = fadd <vscale x 4 x bfloat> %va, %splat
@@ -109,31 +244,75 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa
}
define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vv v12, v16, v12
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v10
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vv v12, v16, v12
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v10
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v16, v12
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 8 x bfloat> %va, %vb
ret <vscale x 8 x bfloat> %vc
}
define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vf v12, v12, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vf v12, v12, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v12, v12, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v12, v12, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
%vc = fadd <vscale x 8 x bfloat> %va, %splat
@@ -141,16 +320,38 @@ define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
}
define <vscale x 8 x bfloat> @vfadd_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_fv_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vf v12, v12, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_fv_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vf v12, v12, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_fv_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v12, v12, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_fv_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v12, v12, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
%vc = fadd <vscale x 8 x bfloat> %splat, %va
@@ -158,31 +359,75 @@ define <vscale x 8 x bfloat> @vfadd_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
}
define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 16 x bfloat> %va, %vb
ret <vscale x 16 x bfloat> %vc
}
define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vf v16, v16, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vf v16, v16, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v16, v16, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v16, v16, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
%vc = fadd <vscale x 16 x bfloat> %va, %splat
@@ -190,78 +435,216 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
}
define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v0, v0, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: sub sp, sp, a0
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v0, v0, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v0, v0, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: sub sp, sp, a0
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vfwcvt.f.f.v v0, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v0, v0, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 32 x bfloat> %va, %vb
ret <vscale x 32 x bfloat> %vc
}
define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: fmv.x.h a0, fa0
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v0, v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: sub sp, sp, a0
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: fmv.x.h a0, fa0
+; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: addi a1, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12
+; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vmv.v.x v8, a0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v0, v8, v0
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: fmv.x.h a0, fa0
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v8, a0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v0, v8, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: sub sp, sp, a0
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: addi a1, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v8, a0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v0, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v0, v8, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%vc = fadd <vscale x 32 x bfloat> %va, %splat
@@ -285,6 +668,12 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 1 x half> %va, %vb
ret <vscale x 1 x half> %vc
}
@@ -306,6 +695,12 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16(<vscale x 1 x half> %va, half %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 1 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 1 x half> %head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
%vc = fadd <vscale x 1 x half> %va, %splat
@@ -329,6 +724,12 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 2 x half> %va, %vb
ret <vscale x 2 x half> %vc
}
@@ -350,6 +751,12 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16(<vscale x 2 x half> %va, half %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 2 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 2 x half> %head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
%vc = fadd <vscale x 2 x half> %va, %splat
@@ -373,6 +780,12 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 4 x half> %va, %vb
ret <vscale x 4 x half> %vc
}
@@ -394,6 +807,12 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 4 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 4 x half> %head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
%vc = fadd <vscale x 4 x half> %va, %splat
@@ -417,6 +836,12 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 8 x half> %va, %vb
ret <vscale x 8 x half> %vc
}
@@ -438,6 +863,12 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
%vc = fadd <vscale x 8 x half> %va, %splat
@@ -461,6 +892,12 @@ define <vscale x 8 x half> @vfadd_fv_nxv8f16(<vscale x 8 x half> %va, half %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_fv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
%vc = fadd <vscale x 8 x half> %splat, %va
@@ -484,6 +921,12 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v12
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 16 x half> %va, %vb
ret <vscale x 16 x half> %vc
}
@@ -505,6 +948,12 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16(<vscale x 16 x half> %va, half %b
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 16 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
%vc = fadd <vscale x 16 x half> %va, %splat
@@ -549,6 +998,12 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v16
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 32 x half> %va, %vb
ret <vscale x 32 x half> %vc
}
@@ -596,6 +1051,12 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%vc = fadd <vscale x 32 x half> %va, %splat
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index 32e3d6b..633a201 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -11,52 +11,125 @@
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+zvfhmin,+experimental-zvfbfa,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
declare <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x bfloat> %v
}
define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv1bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv1bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv1bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> splat (i1 true), i32 %evl)
ret <vscale x 1 x bfloat> %v
}
define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl)
@@ -64,18 +137,44 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloa
}
define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_commute(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16_commute:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v8, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16_commute:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v8, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16_commute:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v8, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16_commute:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v8, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
@@ -83,18 +182,44 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_commute(<vscale x 1 x bfloat> %v
}
define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v10, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v10, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -102,18 +227,44 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
}
define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked_commute(<vscale x 1 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16_unmasked_commute:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v8, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16_unmasked_commute:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v8, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16_unmasked_commute:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v8, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16_unmasked_commute:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v8, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -123,48 +274,118 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked_commute(<vscale x 1 x b
declare <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 %evl)
ret <vscale x 2 x bfloat> %v
}
define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv2bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv2bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv2bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x bfloat> %v
}
define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
%v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl)
@@ -172,18 +393,44 @@ define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloa
}
define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv2bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v10, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv2bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v10, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv2bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
%v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -193,48 +440,118 @@ define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
declare <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vv v10, v12, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vv v10, v12, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v12, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 %evl)
ret <vscale x 4 x bfloat> %v
}
define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv4bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vv v10, v12, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv4bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vv v10, v12, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv4bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v12, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
ret <vscale x 4 x bfloat> %v
}
define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v12, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vv v10, v10, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vmv.v.x v12, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vv v10, v10, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v12, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v10, v10, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v12, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
%v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl)
@@ -242,18 +559,44 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa
}
define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv4bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v12, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vv v10, v10, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv4bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vmv.v.x v12, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v12
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vv v10, v10, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv4bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v12, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v10, v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v12, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
%v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -263,48 +606,118 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
declare <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vv v12, v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vv v12, v16, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 %evl)
ret <vscale x 8 x bfloat> %v
}
define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv8bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vv v12, v16, v12
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv8bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v10
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vv v12, v16, v12
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv8bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v10
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v16, v12
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> splat (i1 true), i32 %evl)
ret <vscale x 8 x bfloat> %v
}
define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v16, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vv v12, v12, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vmv.v.x v16, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vv v12, v12, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v16, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v12, v12, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v12, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
%v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl)
@@ -312,18 +725,44 @@ define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
}
define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv8bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v16, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vv v12, v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv8bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vmv.v.x v16, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vv v12, v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv8bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v16, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v12, v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
%v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -333,48 +772,118 @@ define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
declare <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 %evl)
ret <vscale x 16 x bfloat> %v
}
define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv16bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv16bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv16bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x bfloat> %v
}
define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vmv.v.x v24, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
%v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl)
@@ -382,18 +891,44 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
}
define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv16bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv16bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vmv.v.x v24, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v24
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv16bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
%v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -403,173 +938,493 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat
declare <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB22_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB22_2:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a1, vlenb
+; ZVFH-NEXT: slli a1, a1, 3
+; ZVFH-NEXT: sub sp, sp, a1
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFH-NEXT: vmv1r.v v7, v0
+; ZVFH-NEXT: csrr a2, vlenb
+; ZVFH-NEXT: slli a1, a2, 1
+; ZVFH-NEXT: srli a2, a2, 2
+; ZVFH-NEXT: sub a3, a0, a1
+; ZVFH-NEXT: vslidedown.vx v0, v0, a2
+; ZVFH-NEXT: sltu a2, a0, a3
+; ZVFH-NEXT: addi a2, a2, -1
+; ZVFH-NEXT: and a2, a2, a3
+; ZVFH-NEXT: addi a3, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFH-NEXT: bltu a0, a1, .LBB22_2
+; ZVFH-NEXT: # %bb.1:
+; ZVFH-NEXT: mv a0, a1
+; ZVFH-NEXT: .LBB22_2:
+; ZVFH-NEXT: vmv1r.v v0, v7
+; ZVFH-NEXT: addi a1, sp, 16
+; ZVFH-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v7, v0
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a1, a2, 1
+; ZVFHMIN-NEXT: srli a2, a2, 2
+; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a2, a2, -1
+; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a3, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
+; ZVFHMIN-NEXT: # %bb.1:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB22_2:
+; ZVFHMIN-NEXT: vmv1r.v v0, v7
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vmv1r.v v7, v0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB22_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB22_2:
+; ZVFBFA-NEXT: vmv1r.v v0, v7
+; ZVFBFA-NEXT: addi a1, sp, 16
+; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x bfloat> %v
}
define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv32bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmset.m v24
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB23_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB23_2:
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv32bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a1, vlenb
+; ZVFH-NEXT: slli a1, a1, 3
+; ZVFH-NEXT: sub sp, sp, a1
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: csrr a2, vlenb
+; ZVFH-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; ZVFH-NEXT: vmset.m v24
+; ZVFH-NEXT: slli a1, a2, 1
+; ZVFH-NEXT: srli a2, a2, 2
+; ZVFH-NEXT: sub a3, a0, a1
+; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFH-NEXT: vslidedown.vx v0, v24, a2
+; ZVFH-NEXT: sltu a2, a0, a3
+; ZVFH-NEXT: addi a2, a2, -1
+; ZVFH-NEXT: and a2, a2, a3
+; ZVFH-NEXT: addi a3, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFH-NEXT: bltu a0, a1, .LBB23_2
+; ZVFH-NEXT: # %bb.1:
+; ZVFH-NEXT: mv a0, a1
+; ZVFH-NEXT: .LBB23_2:
+; ZVFH-NEXT: addi a1, sp, 16
+; ZVFH-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv32bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT: vmset.m v24
+; ZVFHMIN-NEXT: slli a1, a2, 1
+; ZVFHMIN-NEXT: srli a2, a2, 2
+; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
+; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a2, a2, -1
+; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a3, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
+; ZVFHMIN-NEXT: # %bb.1:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB23_2:
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; ZVFBFA-NEXT: vmset.m v24
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB23_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB23_2:
+; ZVFBFA-NEXT: addi a1, sp, 16
+; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x bfloat> %v
}
define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB24_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB24_2:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a1, vlenb
+; ZVFH-NEXT: slli a1, a1, 4
+; ZVFH-NEXT: sub sp, sp, a1
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vmv1r.v v7, v0
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: csrr a2, vlenb
+; ZVFH-NEXT: vmv.v.x v24, a1
+; ZVFH-NEXT: slli a1, a2, 1
+; ZVFH-NEXT: srli a2, a2, 2
+; ZVFH-NEXT: sub a3, a0, a1
+; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFH-NEXT: vslidedown.vx v0, v0, a2
+; ZVFH-NEXT: sltu a2, a0, a3
+; ZVFH-NEXT: addi a2, a2, -1
+; ZVFH-NEXT: and a2, a2, a3
+; ZVFH-NEXT: csrr a3, vlenb
+; ZVFH-NEXT: slli a3, a3, 3
+; ZVFH-NEXT: add a3, sp, a3
+; ZVFH-NEXT: addi a3, a3, 16
+; ZVFH-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFH-NEXT: bltu a0, a1, .LBB24_2
+; ZVFH-NEXT: # %bb.1:
+; ZVFH-NEXT: mv a0, a1
+; ZVFH-NEXT: .LBB24_2:
+; ZVFH-NEXT: vmv1r.v v0, v7
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v7, v0
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: slli a1, a2, 1
+; ZVFHMIN-NEXT: srli a2, a2, 2
+; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a2, a2, -1
+; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: slli a3, a3, 3
+; ZVFHMIN-NEXT: add a3, sp, a3
+; ZVFHMIN-NEXT: addi a3, a3, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB24_2
+; ZVFHMIN-NEXT: # %bb.1:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB24_2:
+; ZVFHMIN-NEXT: vmv1r.v v0, v7
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 4
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vmv1r.v v7, v0
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: csrr a3, vlenb
+; ZVFBFA-NEXT: slli a3, a3, 3
+; ZVFBFA-NEXT: add a3, sp, a3
+; ZVFBFA-NEXT: addi a3, a3, 16
+; ZVFBFA-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v28, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB24_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB24_2:
+; ZVFBFA-NEXT: vmv1r.v v0, v7
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add a0, sp, a0
+; ZVFBFA-NEXT: addi a0, a0, 16
+; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v16, v0.t
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 4
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -577,56 +1432,158 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
}
define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv32bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmset.m v24
-; CHECK-NEXT: vmv.v.x v16, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB25_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB25_2:
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv32bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a1, vlenb
+; ZVFH-NEXT: slli a1, a1, 3
+; ZVFH-NEXT: sub sp, sp, a1
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: csrr a2, vlenb
+; ZVFH-NEXT: vsetvli a3, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vmset.m v24
+; ZVFH-NEXT: vmv.v.x v16, a1
+; ZVFH-NEXT: slli a1, a2, 1
+; ZVFH-NEXT: srli a2, a2, 2
+; ZVFH-NEXT: sub a3, a0, a1
+; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFH-NEXT: vslidedown.vx v0, v24, a2
+; ZVFH-NEXT: sltu a2, a0, a3
+; ZVFH-NEXT: addi a2, a2, -1
+; ZVFH-NEXT: and a2, a2, a3
+; ZVFH-NEXT: addi a3, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFH-NEXT: bltu a0, a1, .LBB25_2
+; ZVFH-NEXT: # %bb.1:
+; ZVFH-NEXT: mv a0, a1
+; ZVFH-NEXT: .LBB25_2:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v0
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmset.m v24
+; ZVFHMIN-NEXT: vmv.v.x v16, a1
+; ZVFHMIN-NEXT: slli a1, a2, 1
+; ZVFHMIN-NEXT: srli a2, a2, 2
+; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
+; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a2, a2, -1
+; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a3, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB25_2
+; ZVFHMIN-NEXT: # %bb.1:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB25_2:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vsetvli a3, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vmset.m v24
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB25_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB25_2:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -651,6 +1608,17 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %b, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x half> %v
}
@@ -672,6 +1640,17 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %b, <vscale x 1 x i1> splat (i1 true), i32 %evl)
ret <vscale x 1 x half> %v
}
@@ -695,6 +1674,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 %evl)
@@ -720,6 +1712,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_commute(<vscale x 1 x half> %va, ha
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1f16_commute:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v8, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %vb, <vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 %evl)
@@ -745,6 +1750,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -770,6 +1788,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked_commute(<vscale x 1 x half
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1f16_unmasked_commute:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v8, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %vb, <vscale x 1 x half> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -795,6 +1826,17 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %b, <vscale x 2 x i1> %m, i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -816,6 +1858,17 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %b, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -839,6 +1892,19 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
%v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> %m, i32 %evl)
@@ -864,6 +1930,19 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
%v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -889,6 +1968,17 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v12, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %b, <vscale x 4 x i1> %m, i32 %evl)
ret <vscale x 4 x half> %v
}
@@ -910,6 +2000,17 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v12, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
ret <vscale x 4 x half> %v
}
@@ -933,6 +2034,19 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v12, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
%v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> %m, i32 %evl)
@@ -958,6 +2072,19 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v12, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
%v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -983,6 +2110,17 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %b, <vscale x 8 x i1> %m, i32 %evl)
ret <vscale x 8 x half> %v
}
@@ -1004,6 +2142,17 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v16, v12
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %b, <vscale x 8 x i1> splat (i1 true), i32 %evl)
ret <vscale x 8 x half> %v
}
@@ -1027,6 +2176,19 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v12, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
%v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> %m, i32 %evl)
@@ -1052,6 +2214,19 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
%v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -1077,6 +2252,17 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %b, <vscale x 16 x i1> %m, i32 %evl)
ret <vscale x 16 x half> %v
}
@@ -1098,6 +2284,17 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %b, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x half> %v
}
@@ -1121,6 +2318,19 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16(<vscale x 16 x half> %va, half %b
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
%v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> %m, i32 %evl)
@@ -1146,6 +2356,19 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
%v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -1209,6 +2432,55 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vmv1r.v v7, v0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB48_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB48_2:
+; ZVFBFA-NEXT: vmv1r.v v0, v7
+; ZVFBFA-NEXT: addi a1, sp, 16
+; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%v = call <vscale x 32 x half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
}
@@ -1268,6 +2540,55 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; ZVFBFA-NEXT: vmset.m v24
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB49_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB49_2:
+; ZVFBFA-NEXT: addi a1, sp, 16
+; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%v = call <vscale x 32 x half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x half> %v
}
@@ -1340,6 +2661,68 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 4
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vmv1r.v v7, v0
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: csrr a3, vlenb
+; ZVFBFA-NEXT: slli a3, a3, 3
+; ZVFBFA-NEXT: add a3, sp, a3
+; ZVFBFA-NEXT: addi a3, a3, 16
+; ZVFBFA-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v28, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB50_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB50_2:
+; ZVFBFA-NEXT: vmv1r.v v0, v7
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add a0, sp, a0
+; ZVFBFA-NEXT: addi a0, a0, 16
+; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v16, v0.t
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 4
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -1403,6 +2786,57 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vsetvli a3, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vmset.m v24
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB51_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB51_2:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
index bd07ba1..eb4cf76 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
@@ -20,6 +20,9 @@
; CHECK: OpReadClockKHR [[v2uint]] [[uint_1]]
; CHECK: OpReadClockKHR [[v2uint]] [[uint_2]]
; CHECK: OpReadClockKHR [[v2uint]] [[uint_3]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_1]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_2]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_3]]
define dso_local spir_kernel void @test_clocks(ptr addrspace(1) nocapture noundef writeonly align 8 %out64, ptr addrspace(1) nocapture noundef writeonly align 8 %outv2) {
entry:
@@ -39,6 +42,9 @@ entry:
%call9 = tail call spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv()
%arrayidx10 = getelementptr inbounds i8, ptr addrspace(1) %outv2, i32 16
store <2 x i32> %call9, ptr addrspace(1) %arrayidx10, align 8
+ %call10 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 1)
+ %call11 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 2)
+ %call12 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 3)
ret void
}
@@ -59,3 +65,6 @@ declare spir_func <2 x i32> @_Z26clock_read_hilo_work_groupv() local_unnamed_add
; Function Attrs: convergent nounwind
declare spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv() local_unnamed_addr
+
+; Function Attrs: nounwind
+declare spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
index 4d32e66..6d41875 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
@@ -1,5 +1,5 @@
; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s
-; Test that uses of cbuffer members inside ConstantExprs are handled correctly.
+; Test that uses of cbuffer members, including inside ConstantExprs, are handled correctly.
; CHECK-DAG: OpDecorate %[[MyCBuffer:[0-9]+]] DescriptorSet 0
; CHECK-DAG: OpDecorate %[[MyCBuffer]] Binding 0
@@ -37,10 +37,8 @@ entry:
; CHECK: %[[tmp_ptr:[0-9]+]] = OpAccessChain {{%[0-9]+}} %[[tmp]] %[[uint_0]] %[[uint_0]]
; CHECK: %[[v_ptr:.+]] = OpAccessChain %[[_ptr_Uniform_v4float]] %[[tmp]] %[[uint_0]] %[[uint_1]]
; CHECK: %[[s_ptr_gep:[0-9]+]] = OpInBoundsAccessChain %[[_ptr_Uniform_float]] %[[tmp_ptr]] %[[uint_0]] %[[uint_1]]
- %gep = getelementptr inbounds %MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1
-
; CHECK: %[[s_val:.+]] = OpLoad %[[float]] %[[s_ptr_gep]]
- %load_from_gep = load float, ptr addrspace(12) %gep, align 4
+ %load_from_gep = load float, ptr addrspace(12) getelementptr inbounds (%MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1), align 4
; CHECK: %[[v_val:.+]] = OpLoad %[[v4float]] %[[v_ptr]]
%load_v = load <4 x float>, ptr addrspace(12) @v, align 16
diff --git a/llvm/test/CodeGen/Thumb/PR17309.ll b/llvm/test/CodeGen/Thumb/PR17309.ll
index b548499..4da25ca 100644
--- a/llvm/test/CodeGen/Thumb/PR17309.ll
+++ b/llvm/test/CodeGen/Thumb/PR17309.ll
@@ -48,7 +48,7 @@ declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
-attributes #2 = { optsize "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { optsize "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #3 = { nounwind optsize }
diff --git a/llvm/test/CodeGen/Thumb/fastcc.ll b/llvm/test/CodeGen/Thumb/fastcc.ll
index be356d8..000e20a 100644
--- a/llvm/test/CodeGen/Thumb/fastcc.ll
+++ b/llvm/test/CodeGen/Thumb/fastcc.ll
@@ -29,7 +29,7 @@ for.body193: ; preds = %for.body193, %for.e
br label %for.body193
}
-attributes #0 = { optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/Thumb/ldm-merge-call.ll b/llvm/test/CodeGen/Thumb/ldm-merge-call.ll
index 700b207..33c4346 100644
--- a/llvm/test/CodeGen/Thumb/ldm-merge-call.ll
+++ b/llvm/test/CodeGen/Thumb/ldm-merge-call.ll
@@ -19,6 +19,6 @@ entry:
; Function Attrs: optsize
declare i32 @bar(i32, i32, i32, i32) #1
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind optsize }
diff --git a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll
index cc14239..82314be 100644
--- a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll
@@ -50,4 +50,4 @@ declare void @foo3(ptr)
; Function Attrs: nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Thumb/stm-merge.ll b/llvm/test/CodeGen/Thumb/stm-merge.ll
index 837c2f6..426210a 100644
--- a/llvm/test/CodeGen/Thumb/stm-merge.ll
+++ b/llvm/test/CodeGen/Thumb/stm-merge.ll
@@ -38,4 +38,4 @@ for.end8: ; preds = %for.body5
ret void
}
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-block-debug.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-block-debug.mir
index 4e2a275..00f0a1c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-block-debug.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-block-debug.mir
@@ -118,7 +118,7 @@
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
- attributes #0 = { nofree nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+cdecp0,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
+ attributes #0 = { nofree nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+cdecp0,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, retainedTypes: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "tmp.c", directory: "")
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-1-pred.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-1-pred.mir
index 4e817ba..e48a038 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-1-pred.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-1-pred.mir
@@ -13,7 +13,7 @@
ret <4 x float> %inactive1
}
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-2-preds.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-2-preds.mir
index 6b5cbce..b8657c2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-2-preds.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-2-preds.mir
@@ -16,7 +16,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-ctrl-flow.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-ctrl-flow.mir
index 91ccf3b..68a38a4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-ctrl-flow.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-ctrl-flow.mir
@@ -19,7 +19,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-non-consecutive-ins.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-non-consecutive-ins.mir
index 1f19ed9..caa7b17 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-non-consecutive-ins.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-non-consecutive-ins.mir
@@ -17,7 +17,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks.mir
index 4f75b01..2f07485 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks.mir
@@ -18,7 +18,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir
index c268388..f6b64a0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir
@@ -17,7 +17,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-1-ins.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-1-ins.mir
index 1e9e0e3..d086566 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-1-ins.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-1-ins.mir
@@ -14,7 +14,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-2-ins.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-2-ins.mir
index cb73cdf..5436882 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-2-ins.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-2-ins.mir
@@ -15,7 +15,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-4-ins.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-4-ins.mir
index 62d7640d..435836d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-4-ins.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-4-ins.mir
@@ -17,7 +17,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir
index 130c7f4..dc195dd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir
@@ -17,7 +17,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
index 0ffed2e..ee2e58f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
@@ -18,7 +18,7 @@
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) #2
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #3
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { argmemonly nounwind readonly willreturn }
attributes #3 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-optnone.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-optnone.mir
index 695a8d8..ba21068 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-optnone.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-optnone.mir
@@ -14,7 +14,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { noinline optnone nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline optnone nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-4.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-4.ll
index 8777d51..db779de 100644
--- a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-4.ll
+++ b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-4.ll
@@ -179,7 +179,7 @@ return: ; preds = %entry, %if.end
; CHECK-NOT: aut
; CHECK: b _Z1hii
-attributes #0 = { minsize noinline optsize "sign-return-address"="non-leaf" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+armv7-m,+hwdiv,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { minsize noinline optsize "sign-return-address"="non-leaf" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+armv7-m,+hwdiv,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind "sign-return-address"="non-leaf" }
attributes #2 = { noreturn "sign-return-address"="non-leaf" }
diff --git a/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll b/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll
index 4a93c2c..0ee075c 100644
--- a/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll
@@ -38,7 +38,7 @@ declare void @foo3(ptr)
; Function Attrs: nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!0 = !{i32 7, !"PIC Level", i32 2}
diff --git a/llvm/test/CodeGen/Thumb2/t2sizereduction.mir b/llvm/test/CodeGen/Thumb2/t2sizereduction.mir
index 48b75ed5..f5eb642 100644
--- a/llvm/test/CodeGen/Thumb2/t2sizereduction.mir
+++ b/llvm/test/CodeGen/Thumb2/t2sizereduction.mir
@@ -29,7 +29,7 @@
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
- attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "use-soft-float"="false" }
...
---
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 104ec31..5eb49fd 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -2103,10 +2103,7 @@ for.body: ; preds = %entry, %for.body
; CHECK-LABEL: four_floats_same_op:
; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: f32x4.mul
-; CHECK: v128.store
+; CHECK-NOT: v128.load
define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll
index 45f4ddd..f224a0d 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll
@@ -54,6 +54,252 @@ define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %result
}
+define <4 x float> @test_pmax_v4f32_olt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_olt:
+; CHECK: .functype test_pmax_v4f32_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp olt <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmax_v4f32_ole(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_ole:
+; CHECK: .functype test_pmax_v4f32_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ole <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmax_v4f32_ogt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_ogt:
+; CHECK: .functype test_pmax_v4f32_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ogt <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmax_v4f32_oge(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_oge:
+; CHECK: .functype test_pmax_v4f32_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp oge <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setlt
+define <4 x float> @test_pmax_v4f32_fast_olt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_fast_olt:
+; CHECK: .functype test_pmax_v4f32_fast_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast olt <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setle
+define <4 x float> @test_pmax_v4f32_fast_ole(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_fast_ole:
+; CHECK: .functype test_pmax_v4f32_fast_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ole <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setgt
+define <4 x float> @test_pmax_v4f32_fast_ogt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_fast_ogt:
+; CHECK: .functype test_pmax_v4f32_fast_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ogt <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %x, <4 x float> %y
+ ret <4 x float> %a
+}
+
+; For setge
+define <4 x float> @test_pmax_v4f32_fast_oge(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_fast_oge:
+; CHECK: .functype test_pmax_v4f32_fast_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast oge <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %x, <4 x float> %y
+ ret <4 x float> %a
+}
+
+define <4 x i32> @test_pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_pmax_int_v4f32:
+; CHECK: .functype test_pmax_int_v4f32 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %fx = bitcast <4 x i32> %x to <4 x float>
+ %fy = bitcast <4 x i32> %y to <4 x float>
+ %c = fcmp olt <4 x float> %fy, %fx
+ %a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
+ ret <4 x i32> %a
+}
+
+define <2 x double> @test_pmax_v2f64_olt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_olt:
+; CHECK: .functype test_pmax_v2f64_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp olt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmax_v2f64_ole(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_ole:
+; CHECK: .functype test_pmax_v2f64_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ole <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmax_v2f64_ogt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_ogt:
+; CHECK: .functype test_pmax_v2f64_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ogt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmax_v2f64_oge(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_oge:
+; CHECK: .functype test_pmax_v2f64_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp oge <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
+ ret <2 x double> %a
+}
+
+; For setlt
+define <2 x double> @test_pmax_v2f64_fast_olt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_fast_olt:
+; CHECK: .functype test_pmax_v2f64_fast_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast olt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setle
+define <2 x double> @test_pmax_v2f64_fast_ole(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_fast_ole:
+; CHECK: .functype test_pmax_v2f64_fast_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ole <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setgt
+define <2 x double> @test_pmax_v2f64_fast_ogt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_fast_ogt:
+; CHECK: .functype test_pmax_v2f64_fast_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ogt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
+ ret <2 x double> %a
+}
+
+; For setge
+define <2 x double> @test_pmax_v2f64_fast_oge(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_fast_oge:
+; CHECK: .functype test_pmax_v2f64_fast_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast oge <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
+ ret <2 x double> %a
+}
+
+define <2 x i64> @test_pmax_int_v2f64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: test_pmax_int_v2f64:
+; CHECK: .functype test_pmax_int_v2f64 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %fx = bitcast <2 x i64> %x to <2 x double>
+ %fy = bitcast <2 x i64> %y to <2 x double>
+ %c = fcmp olt <2 x double> %fy, %fx
+ %a = select <2 x i1> %c, <2 x i64> %x, <2 x i64> %y
+ ret <2 x i64> %a
+}
+
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll
index f3eec02..4604465 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll
@@ -53,6 +53,252 @@ define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %result
}
+define <4 x float> @test_pmin_v4f32_olt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_olt:
+; CHECK: .functype test_pmin_v4f32_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp olt <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmin_v4f32_ole(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_ole:
+; CHECK: .functype test_pmin_v4f32_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ole <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmin_v4f32_ogt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_ogt:
+; CHECK: .functype test_pmin_v4f32_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ogt <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmin_v4f32_oge(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_oge:
+; CHECK: .functype test_pmin_v4f32_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp oge <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setlt
+define <4 x float> @test_pmin_v4f32_fast_olt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_fast_olt:
+; CHECK: .functype test_pmin_v4f32_fast_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast olt <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setle
+define <4 x float> @test_pmin_v4f32_fast_ole(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_fast_ole:
+; CHECK: .functype test_pmin_v4f32_fast_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ole <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setgt
+define <4 x float> @test_pmin_v4f32_fast_ogt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_fast_ogt:
+; CHECK: .functype test_pmin_v4f32_fast_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ogt <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setge
+define <4 x float> @test_pmin_v4f32_fast_oge(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_fast_oge:
+; CHECK: .functype test_pmin_v4f32_fast_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast oge <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x i32> @test_pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_pmin_int_v4f32:
+; CHECK: .functype test_pmin_int_v4f32 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %fx = bitcast <4 x i32> %x to <4 x float>
+ %fy = bitcast <4 x i32> %y to <4 x float>
+ %c = fcmp olt <4 x float> %fy, %fx
+ %a = select <4 x i1> %c, <4 x i32> %y, <4 x i32> %x
+ ret <4 x i32> %a
+}
+
+define <2 x double> @test_pmin_v2f64_olt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_olt:
+; CHECK: .functype test_pmin_v2f64_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp olt <2 x double> %y, %x
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmin_v2f64_ole(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_ole:
+; CHECK: .functype test_pmin_v2f64_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ole <2 x double> %y, %x
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmin_v2f64_ogt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_ogt:
+; CHECK: .functype test_pmin_v2f64_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ogt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmin_v2f64_oge(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_oge:
+; CHECK: .functype test_pmin_v2f64_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp oge <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setlt
+define <2 x double> @test_pmin_v2f64_fast_olt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_fast_olt:
+; CHECK: .functype test_pmin_v2f64_fast_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast olt <2 x double> %y, %x
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setle
+define <2 x double> @test_pmin_v2f64_fast_ole(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_fast_ole:
+; CHECK: .functype test_pmin_v2f64_fast_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ole <2 x double> %y, %x
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setgt
+define <2 x double> @test_pmin_v2f64_fast_ogt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_fast_ogt:
+; CHECK: .functype test_pmin_v2f64_fast_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ogt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setge
+define <2 x double> @test_pmin_v2f64_fast_oge(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_fast_oge:
+; CHECK: .functype test_pmin_v2f64_fast_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast oge <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x i64> @test_pmin_int_v2f64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: test_pmin_int_v2f64:
+; CHECK: .functype test_pmin_int_v2f64 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %fx = bitcast <2 x i64> %x to <2 x double>
+ %fy = bitcast <2 x i64> %y to <2 x double>
+ %c = fcmp olt <2 x double> %fy, %fx
+ %a = select <2 x i1> %c, <2 x i64> %y, <2 x i64> %x
+ ret <2 x i64> %a
+}
+
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
declare <4 x float> @llvm.fminimumnum.v4f32(<4 x float>, <4 x float>)
declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll b/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
index f82cd11..f12fc4a8 100644
--- a/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
+++ b/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
@@ -23,6 +23,6 @@ entry:
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) #1
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir b/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir
index 64e569c..3d40240 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir
@@ -63,7 +63,7 @@
ret double %conv
}
- attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir b/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir
index 0bb6061..8f76ad5 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir
@@ -35,7 +35,7 @@
ret double %conv
}
- attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir b/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
index a992222..1612485 100644
--- a/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
+++ b/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
@@ -64,9 +64,9 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #3
- attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
- attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "use-soft-float"="false" }
attributes #3 = { nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/atom-fixup-lea4.ll b/llvm/test/CodeGen/X86/atom-fixup-lea4.ll
index 8e7a463..69689f0 100644
--- a/llvm/test/CodeGen/X86/atom-fixup-lea4.ll
+++ b/llvm/test/CodeGen/X86/atom-fixup-lea4.ll
@@ -18,5 +18,5 @@ entry:
; Function Attrs: uwtable
declare void @_ZN12ValueWrapperIS_IS_IdEEEC2Ev(ptr) unnamed_addr #0 align 2
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 45277ce..3e7b73a 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0
define void @test1(ptr %ptr, i32 %val1) {
; CHECK-LABEL: test1:
@@ -34,6 +34,943 @@ define i32 @test3(ptr %ptr) {
%val = load atomic i32, ptr %ptr seq_cst, align 4
ret i32 %val
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-O0: {{.*}}
-; CHECK-O3: {{.*}}
+
+define <1 x i32> @atomic_vec1_i32(ptr %x) {
+; CHECK-LABEL: atomic_vec1_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: retq
+ %ret = load atomic <1 x i32>, ptr %x acquire, align 4
+ ret <1 x i32> %ret
+}
+
+define <1 x i8> @atomic_vec1_i8(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_i8:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i8:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i8:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i8:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movb (%rdi), %al
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i8:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movb (%rdi), %al
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i8:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movb (%rdi), %al
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x i8>, ptr %x acquire, align 1
+ ret <1 x i8> %ret
+}
+
+define <1 x i16> @atomic_vec1_i16(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_i16:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i16:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i16:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i16:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movw (%rdi), %ax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i16:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movw (%rdi), %ax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i16:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movw (%rdi), %ax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x i16>, ptr %x acquire, align 2
+ ret <1 x i16> %ret
+}
+
+define <1 x i32> @atomic_vec1_i8_zext(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_i8_zext:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-O3-NEXT: movzbl %al, %eax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i8_zext:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: movzbl %al, %eax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i8_zext:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: movzbl %al, %eax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i8_zext:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movb (%rdi), %al
+; CHECK-O0-NEXT: movzbl %al, %eax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i8_zext:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movb (%rdi), %al
+; CHECK-SSE-O0-NEXT: movzbl %al, %eax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i8_zext:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movb (%rdi), %al
+; CHECK-AVX-O0-NEXT: movzbl %al, %eax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x i8>, ptr %x acquire, align 1
+ %zret = zext <1 x i8> %ret to <1 x i32>
+ ret <1 x i32> %zret
+}
+
+define <1 x i64> @atomic_vec1_i16_sext(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_i16_sext:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-O3-NEXT: movswq %ax, %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i16_sext:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: movswq %ax, %rax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i16_sext:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: movswq %ax, %rax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i16_sext:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movw (%rdi), %ax
+; CHECK-O0-NEXT: movswq %ax, %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i16_sext:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movw (%rdi), %ax
+; CHECK-SSE-O0-NEXT: movswq %ax, %rax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i16_sext:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movw (%rdi), %ax
+; CHECK-AVX-O0-NEXT: movswq %ax, %rax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x i16>, ptr %x acquire, align 2
+ %sret = sext <1 x i16> %ret to <1 x i64>
+ ret <1 x i64> %sret
+}
+
+define <1 x ptr addrspace(270)> @atomic_vec1_ptr270(ptr %x) {
+; CHECK-LABEL: atomic_vec1_ptr270:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: retq
+ %ret = load atomic <1 x ptr addrspace(270)>, ptr %x acquire, align 4
+ ret <1 x ptr addrspace(270)> %ret
+}
+
+define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_bfloat:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_bfloat:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movw (%rdi), %cx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movw (%rdi), %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movw (%rdi), %cx
+; CHECK-AVX-O0-NEXT: # implicit-def: $eax
+; CHECK-AVX-O0-NEXT: movw %cx, %ax
+; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0
+; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x bfloat>, ptr %x acquire, align 2
+ ret <1 x bfloat> %ret
+}
+
+define <1 x ptr> @atomic_vec1_ptr_align(ptr %x) nounwind {
+; CHECK-LABEL: atomic_vec1_ptr_align:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq (%rdi), %rax
+; CHECK-NEXT: retq
+ %ret = load atomic <1 x ptr>, ptr %x acquire, align 8
+ ret <1 x ptr> %ret
+}
+
+define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind {
+; CHECK-LABEL: atomic_vec1_i64_align:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq (%rdi), %rax
+; CHECK-NEXT: retq
+ %ret = load atomic <1 x i64>, ptr %x acquire, align 8
+ ret <1 x i64> %ret
+}
+
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_ptr:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movq (%rsp), %rax
+; CHECK-O3-NEXT: popq %rcx
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_ptr:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O3-NEXT: popq %rcx
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_ptr:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O3-NEXT: popq %rcx
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_ptr:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq (%rsp), %rax
+; CHECK-O0-NEXT: popq %rcx
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_ptr:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O0-NEXT: popq %rcx
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_ptr:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O0-NEXT: popq %rcx
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+ ret <1 x ptr> %ret
+}
+
+define <1 x half> @atomic_vec1_half(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movw (%rdi), %cx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movw (%rdi), %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movw (%rdi), %cx
+; CHECK-AVX-O0-NEXT: # implicit-def: $eax
+; CHECK-AVX-O0-NEXT: movw %cx, %ax
+; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0
+; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x half>, ptr %x acquire, align 2
+ ret <1 x half> %ret
+}
+
+define <1 x float> @atomic_vec1_float(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_float:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_float:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_float:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_float:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_float:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_float:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x float>, ptr %x acquire, align 4
+ ret <1 x float> %ret
+}
+
+define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_double_align:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_double_align:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_double_align:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_double_align:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_double_align:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_double_align:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x double>, ptr %x acquire, align 8
+ ret <1 x double> %ret
+}
+
+define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_i64:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movq (%rsp), %rax
+; CHECK-O3-NEXT: popq %rcx
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i64:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O3-NEXT: popq %rcx
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i64:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O3-NEXT: popq %rcx
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i64:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq (%rsp), %rax
+; CHECK-O0-NEXT: popq %rcx
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i64:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O0-NEXT: popq %rcx
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i64:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O0-NEXT: popq %rcx
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x i64>, ptr %x acquire, align 4
+ ret <1 x i64> %ret
+}
+
+define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_double:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: popq %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_double:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: popq %rax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_double:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: popq %rax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_double:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: popq %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_double:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: popq %rax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_double:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: popq %rax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x double>, ptr %x acquire, align 4
+ ret <1 x double> %ret
+}
+
+define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec2_i32:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: popq %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_i32:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: popq %rax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_i32:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: popq %rax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec2_i32:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: popq %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_i32:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: popq %rax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_i32:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: popq %rax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <2 x i32>, ptr %x acquire, align 4
+ ret <2 x i32> %ret
+}
+
+define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_float:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $24, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $16, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: addq $24, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_float:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $24, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $16, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: addq $24, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_float:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: subq $24, %rsp
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $16, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovaps (%rsp), %xmm0
+; CHECK-AVX-O3-NEXT: addq $24, %rsp
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec4_float:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $24, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $16, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: addq $24, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec4_float:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $24, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $16, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: addq $24, %rsp
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_float:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: subq $24, %rsp
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $16, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovaps (%rsp), %xmm0
+; CHECK-AVX-O0-NEXT: addq $24, %rsp
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <4 x float>, ptr %x acquire, align 4
+ ret <4 x float> %ret
+}
+
+define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec8_double:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $72, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $64, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O3-NEXT: addq $72, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec8_double:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $72, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $64, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O3-NEXT: addq $72, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec8_double:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $72, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $64, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movapd (%rsp), %xmm0
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O0-NEXT: addq $72, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec8_double:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $72, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $64, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movapd (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O0-NEXT: addq $72, %rsp
+; CHECK-SSE-O0-NEXT: retq
+ %ret = load atomic <8 x double>, ptr %x acquire, align 4
+ ret <8 x double> %ret
+}
+
+define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $40, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $32, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: addq $40, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $40, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $32, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: addq $40, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: subq $40, %rsp
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $32, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovups (%rsp), %ymm0
+; CHECK-AVX-O3-NEXT: addq $40, %rsp
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $40, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $32, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: addq $40, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $40, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $32, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: addq $40, %rsp
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: subq $40, %rsp
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $32, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovups (%rsp), %ymm0
+; CHECK-AVX-O0-NEXT: addq $40, %rsp
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <16 x bfloat>, ptr %x acquire, align 4
+ ret <16 x bfloat> %ret
+}
+
+define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec32_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $72, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $64, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O3-NEXT: addq $72, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec32_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $72, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $64, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O3-NEXT: addq $72, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec32_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $72, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $64, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O0-NEXT: addq $72, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec32_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $72, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $64, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O0-NEXT: addq $72, %rsp
+; CHECK-SSE-O0-NEXT: retq
+ %ret = load atomic <32 x half>, ptr %x acquire, align 4
+ ret <32 x half> %ret
+}
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
index 99fee27..88d7682 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
@@ -52,7 +52,7 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
index 50b2433..8dbd4e2 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
@@ -63,7 +63,7 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
index 7a4b993..c0924ea 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
@@ -73,7 +73,7 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll b/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll
index da9d16c..f074390 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll
+++ b/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll
@@ -502,6 +502,6 @@ entry:
ret void
}
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
index b4ba239..7fd4f59 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -48,5 +48,5 @@ entry:
; Function Attrs: nounwind readnone
declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) #1
-attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll b/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
index 4e76262..423e318 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
@@ -19,7 +19,7 @@ entry:
; CHECK: func:
; CHECK: .Lfunc_begin1:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; BASIC-NEXT: .byte 0 # feature
; PGO-NEXT: .byte 3 # feature
; CHECK-NEXT: .quad .Lfunc_begin1 # function address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll b/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
index f610b04..e32e522 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
@@ -10,7 +10,7 @@ define dso_local i32 @_Z3barv() {
; CHECK-LABEL: _Z3barv:
; CHECK-NEXT: [[BAR_BEGIN:.Lfunc_begin[0-9]+]]:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3barv{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 0 # feature
; CHECK-NEXT: .quad [[BAR_BEGIN]] # function address
@@ -23,7 +23,7 @@ define dso_local i32 @_Z3foov() {
; CHECK-LABEL: _Z3foov:
; CHECK-NEXT: [[FOO_BEGIN:.Lfunc_begin[0-9]+]]:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3foov{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 32 # feature
; CHECK-NEXT: .quad [[FOO_BEGIN]] # function address
@@ -36,6 +36,6 @@ define linkonce_odr dso_local i32 @_Z4fooTIiET_v() comdat {
; CHECK-LABEL: _Z4fooTIiET_v:
; CHECK-NEXT: [[FOOCOMDAT_BEGIN:.Lfunc_begin[0-9]+]]:
; CHECK: .section .llvm_bb_addr_map,"oG",@llvm_bb_addr_map,.text._Z4fooTIiET_v,_Z4fooTIiET_v,comdat{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 0 # feature
; CHECK-NEXT: .quad [[FOOCOMDAT_BEGIN]] # function address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll b/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
index ba76f3e..12b1297 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
@@ -69,7 +69,7 @@ declare i32 @__gxx_personality_v0(...)
; CHECK-LABEL: .Lfunc_end0:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; BASIC-NEXT: .byte 32 # feature
; PGO-ALL-NEXT: .byte 39 # feature
; FEC-ONLY-NEXT:.byte 33 # feature
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
index 6157f1a..aeb6dc95 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
@@ -47,7 +47,7 @@ declare i32 @__gxx_personality_v0(...)
; CHECK-LABEL: .Lfunc_end0:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text.hot._Z3bazb
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 40 # feature
; CHECK-NEXT: .byte 2 # number of basic block ranges
; CHECK-NEXT: .quad .Lfunc_begin0 # base address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll
new file mode 100644
index 0000000..a5678877
--- /dev/null
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll
@@ -0,0 +1,94 @@
+; Check that the -basic-block-address-map option works when used along with -emit-bb-hash.
+; RUN: llc < %s -mtriple=x86_64 -function-sections -unique-section-names=true -basic-block-address-map -emit-bb-hash | FileCheck %s --check-prefixes=CHECK,UNIQ
+
+define void @_Z3bazb(i1 zeroext, i1 zeroext) personality ptr @__gxx_personality_v0 {
+ br i1 %0, label %3, label %8
+
+3:
+ %4 = invoke i32 @_Z3barv()
+ to label %8 unwind label %6
+ br label %10
+
+6:
+ landingpad { ptr, i32 }
+ catch ptr null
+ br label %12
+
+8:
+ %9 = call i32 @_Z3foov()
+ br i1 %1, label %12, label %10
+
+10:
+ %11 = select i1 %1, ptr blockaddress(@_Z3bazb, %3), ptr blockaddress(@_Z3bazb, %12) ; <ptr> [#uses=1]
+ indirectbr ptr %11, [label %3, label %12]
+
+12:
+ ret void
+}
+
+declare i32 @_Z3barv() #1
+
+declare i32 @_Z3foov() #1
+
+declare i32 @__gxx_personality_v0(...)
+
+; UNIQ: .section .text._Z3bazb,"ax",@progbits{{$}}
+; NOUNIQ: .section .text,"ax",@progbits,unique,1
+; CHECK-LABEL: _Z3bazb:
+; CHECK-LABEL: .Lfunc_begin0:
+; CHECK-LABEL: .LBB_END0_0:
+; CHECK-LABEL: .LBB0_1:
+; CHECK-LABEL: .LBB0_1_CS0:
+; CHECK-LABEL: .LBB_END0_1:
+; CHECK-LABEL: .LBB0_2:
+; CHECK-LABEL: .LBB0_2_CS0:
+; CHECK-LABEL: .LBB_END0_2:
+; CHECK-LABEL: .LBB0_3:
+; CHECK-LABEL: .LBB_END0_3:
+; CHECK-LABEL: .Lfunc_end0:
+
+; UNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
+;; Verify that with -unique-section-names=false, the unique id of the text section gets assigned to the llvm_bb_addr_map section.
+; NOUNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text,unique,1
+; CHECK-NEXT: .byte 4 # version
+; CHECK-NEXT: .byte 96 # feature
+; CHECK-NEXT: .quad .Lfunc_begin0 # function address
+; CHECK-NEXT: .byte 6 # number of basic blocks
+; CHECK-NEXT: .byte 0 # BB id
+; CHECK-NEXT: .uleb128 .Lfunc_begin0-.Lfunc_begin0
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_0-.Lfunc_begin0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 1 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_1-.LBB_END0_0
+; CHECK-NEXT: .byte 1 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB0_1_CS0-.LBB0_1
+; CHECK-NEXT: .uleb128 .LBB_END0_1-.LBB0_1_CS0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 3 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_2-.LBB_END0_1
+; CHECK-NEXT: .byte 1 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB0_2_CS0-.LBB0_2
+; CHECK-NEXT: .uleb128 .LBB_END0_2-.LBB0_2_CS0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 4 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_3-.LBB_END0_2
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_3-.LBB0_3
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 5 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_4-.LBB_END0_3
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_4-.LBB0_4
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 2 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_5-.LBB_END0_4
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_5-.LBB0_5
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .quad {{-?[0-9]+}}
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
index 1e8cee4..d49b313 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
@@ -58,7 +58,7 @@ declare i32 @qux()
; CHECK-LABEL: .Lfunc_end0:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text.hot.foo
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; BASIC-NEXT: .byte 40 # feature
; PGO-NEXT: .byte 47 # feature
; CHECK-NEXT: .byte 2 # number of basic block ranges
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map.ll b/llvm/test/CodeGen/X86/basic-block-address-map.ll
index 5c8f3a6..64cf2c7 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map.ll
@@ -52,7 +52,7 @@ declare i32 @__gxx_personality_v0(...)
; UNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
;; Verify that with -unique-section-names=false, the unique id of the text section gets assigned to the llvm_bb_addr_map section.
; NOUNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text,unique,1
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 32 # feature
; CHECK-NEXT: .quad .Lfunc_begin0 # function address
; CHECK-NEXT: .byte 6 # number of basic blocks
diff --git a/llvm/test/CodeGen/X86/bit-piece-comment.ll b/llvm/test/CodeGen/X86/bit-piece-comment.ll
index d74863f..85c64a7 100644
--- a/llvm/test/CodeGen/X86/bit-piece-comment.ll
+++ b/llvm/test/CodeGen/X86/bit-piece-comment.ll
@@ -32,7 +32,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-attributes #0 = { norecurse nounwind optsize readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind optsize readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
index 632d90d..f36baba 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
@@ -27,7 +27,7 @@ entry:
!1 = !{i64 0, !"_ZTSFivE.generalized"}
!2 = !{i64 0, !"_ZTSFviE.generalized"}
-; CHECK: .section .llvm.callgraph,"o",@progbits,.text
+; CHECK: .section .llvm.callgraph,"o",@llvm_call_graph,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
index ed6849a..cdbad66 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
@@ -36,7 +36,7 @@ entry:
!4 = !{!5}
!5 = !{i64 0, !"_ZTSFPvS_E.generalized"}
-; CHECK: .section .llvm.callgraph,"o",@progbits,.text
+; CHECK: .section .llvm.callgraph,"o",@llvm_call_graph,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags
diff --git a/llvm/test/CodeGen/X86/catchpad-regmask.ll b/llvm/test/CodeGen/X86/catchpad-regmask.ll
index 9dba897..713d015 100644
--- a/llvm/test/CodeGen/X86/catchpad-regmask.ll
+++ b/llvm/test/CodeGen/X86/catchpad-regmask.ll
@@ -130,7 +130,7 @@ unreachable: ; preds = %entry
; CHECK: retq # CATCHRET
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { noreturn }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/X86/catchpad-weight.ll b/llvm/test/CodeGen/X86/catchpad-weight.ll
index e97f358..699243d 100644
--- a/llvm/test/CodeGen/X86/catchpad-weight.ll
+++ b/llvm/test/CodeGen/X86/catchpad-weight.ll
@@ -74,8 +74,8 @@ declare void @"\01??1HasDtor@@QEAA@XZ"(ptr) #3
; Function Attrs: nounwind argmemonly
declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { nounwind argmemonly }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
+attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #4 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/clang-section-coff.ll b/llvm/test/CodeGen/X86/clang-section-coff.ll
index 02381fd..6b76bb6 100644
--- a/llvm/test/CodeGen/X86/clang-section-coff.ll
+++ b/llvm/test/CodeGen/X86/clang-section-coff.ll
@@ -37,8 +37,8 @@ attributes #0 = { "bss-section"="my_bss.1" "data-section"="my_data.1" "rodata-se
attributes #1 = { "data-section"="my_data.1" "rodata-section"="my_rodata.1" }
attributes #2 = { "bss-section"="my_bss.2" "rodata-section"="my_rodata.1" }
attributes #3 = { "bss-section"="my_bss.2" "data-section"="my_data.2" "rodata-section"="my_rodata.2" }
-attributes #6 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #7 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1, !2, !3}
diff --git a/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll b/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll
index 01e7019..863f580 100644
--- a/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll
+++ b/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll
@@ -65,4 +65,4 @@ declare i32 @__CxxFrameHandler3(...)
declare x86_thiscallcc void @"\01??1A@@QAE@XZ"(ptr) #0
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/combine-adc.ll b/llvm/test/CodeGen/X86/combine-adc.ll
index 2241736..a2aaea3 100644
--- a/llvm/test/CodeGen/X86/combine-adc.ll
+++ b/llvm/test/CodeGen/X86/combine-adc.ll
@@ -89,4 +89,52 @@ define i32 @adc_merge_constants(i32 %a0) nounwind {
ret i32 %sum
}
+define i32 @adc_merge_sub(i32 %a0) nounwind {
+; X86-LABEL: adc_merge_sub:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: addl $42, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: negl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use@PLT
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: adc_merge_sub:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %rbx
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl %edi, %ebx
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: addl $42, %ebx
+; X64-NEXT: setb %dil
+; X64-NEXT: movl %ebx, %ebp
+; X64-NEXT: negl %ebp
+; X64-NEXT: callq use@PLT
+; X64-NEXT: xorl %ebx, %ebp
+; X64-NEXT: movl %ebp, %eax
+; X64-NEXT: addq $8, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+ %adc = tail call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %a0, i32 42)
+ %carry = extractvalue { i8, i32 } %adc, 0
+ call void @use(i8 %carry)
+ %sum = extractvalue { i8, i32 } %adc, 1
+ %sub = sub i32 -42, %a0
+ %result = xor i32 %sum, %sub
+ ret i32 %result
+}
+
declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
+declare void @use(i8)
diff --git a/llvm/test/CodeGen/X86/combine-sbb.ll b/llvm/test/CodeGen/X86/combine-sbb.ll
index 89aee96..62744d4 100644
--- a/llvm/test/CodeGen/X86/combine-sbb.ll
+++ b/llvm/test/CodeGen/X86/combine-sbb.ll
@@ -333,4 +333,85 @@ define i32 @PR40483_sub6(ptr, i32) nounwind {
ret i32 %10
}
+define i32 @sbb_merge_add1(i32 %a0) nounwind {
+; X86-LABEL: sbb_merge_add1:
+; X86: # %bb.0:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl $42, {{[0-9]+}}(%esp)
+; X86-NEXT: setb %al
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use@PLT
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: sbb_merge_add1:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl $42, %edi
+; X64-NEXT: setb %al
+; X64-NEXT: movl %eax, %edi
+; X64-NEXT: callq use@PLT
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: popq %rcx
+; X64-NEXT: retq
+ %sbb = tail call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 %a0, i32 42)
+ %borrow = extractvalue { i8, i32 } %sbb, 0
+ call void @use(i8 %borrow)
+ %diff = extractvalue { i8, i32 } %sbb, 1
+ %add = add i32 %a0, -42
+ %result = xor i32 %diff, %add
+ ret i32 %result
+}
+
+define i32 @sbb_merge_add2(i32 %a0) nounwind {
+; X86-LABEL: sbb_merge_add2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl $42, %edi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: setb %al
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: negl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use@PLT
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: sbb_merge_add2:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %rbx
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl $42, %ebp
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: subl %edi, %ebp
+; X64-NEXT: setb %al
+; X64-NEXT: movl %ebp, %ebx
+; X64-NEXT: negl %ebx
+; X64-NEXT: movl %eax, %edi
+; X64-NEXT: callq use@PLT
+; X64-NEXT: xorl %ebp, %ebx
+; X64-NEXT: movl %ebx, %eax
+; X64-NEXT: addq $8, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+ %sbb = tail call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 42, i32 %a0)
+ %borrow = extractvalue { i8, i32 } %sbb, 0
+ call void @use(i8 %borrow)
+ %diff = extractvalue { i8, i32 } %sbb, 1
+ %add = add i32 %a0, -42
+ %result = xor i32 %diff, %add
+ ret i32 %result
+}
+
declare { i8, i32 } @llvm.x86.subborrow.32(i8, i32, i32)
+declare void @use(i8)
diff --git a/llvm/test/CodeGen/X86/complex-fastmath.ll b/llvm/test/CodeGen/X86/complex-fastmath.ll
index 29a37a1..21bb64a 100644
--- a/llvm/test/CodeGen/X86/complex-fastmath.ll
+++ b/llvm/test/CodeGen/X86/complex-fastmath.ll
@@ -212,4 +212,4 @@ define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 {
ret <2 x double> %14
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" }
diff --git a/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll b/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
index ddddcfa..9f51fa4 100644
--- a/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
+++ b/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
@@ -264,5 +264,5 @@ unreachable: ; preds = %cleanup100
; Function Attrs: nounwind
declare void @printf(ptr nocapture readonly, ...) #1
-attributes #0 = { noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/dag-optnone.ll b/llvm/test/CodeGen/X86/dag-optnone.ll
index 66e4c1d..022694e 100644
--- a/llvm/test/CodeGen/X86/dag-optnone.ll
+++ b/llvm/test/CodeGen/X86/dag-optnone.ll
@@ -28,7 +28,7 @@
; a repeated fadd that can be combined into an fmul. We show that this
; happens in both the non-optnone function and the optnone function.
-define float @foo(float %x, ...) #0 {
+define float @foo(float %x, ...) {
entry:
%add = fadd fast float %x, %x
%add1 = fadd fast float %add, %x
@@ -68,5 +68,4 @@ entry:
ret void
}
-attributes #0 = { "unsafe-fp-math"="true" }
-attributes #1 = { noinline optnone "unsafe-fp-math"="true" }
+attributes #1 = { noinline optnone }
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index b428ce4..71ad598 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -96,6 +96,17 @@ entry:
define void @_Z2x6v() local_unnamed_addr {
; CHECK-LABEL: _Z2x6v:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax
+; CHECK-NEXT: movl (%rax), %edx
+; CHECK-NEXT: andl $511, %edx # imm = 0x1FF
+; CHECK-NEXT: leaq 1(%rdx), %rax
+; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx
+; CHECK-NEXT: movl %eax, (%rcx)
+; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx
+; CHECK-NEXT: movl (%rcx), %ecx
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: je .LBB1_18
+; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
@@ -114,58 +125,47 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax
-; CHECK-NEXT: movl (%rax), %ebx
-; CHECK-NEXT: andl $511, %ebx # imm = 0x1FF
-; CHECK-NEXT: leaq 1(%rbx), %rax
-; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx
-; CHECK-NEXT: movl %eax, (%rcx)
-; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx
-; CHECK-NEXT: movl (%rcx), %ecx
-; CHECK-NEXT: testl %ecx, %ecx
-; CHECK-NEXT: je .LBB1_18
-; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph
-; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rdx
-; CHECK-NEXT: movq (%rdx), %rsi
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: notl %edx
-; CHECK-NEXT: leaq 8(,%rdx,8), %rdi
+; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rsi
+; CHECK-NEXT: movq (%rsi), %rsi
+; CHECK-NEXT: movl %ecx, %edi
+; CHECK-NEXT: notl %edi
+; CHECK-NEXT: leaq 8(,%rdi,8), %rdi
; CHECK-NEXT: imulq %rax, %rdi
; CHECK-NEXT: addq %rsi, %rdi
; CHECK-NEXT: movq x2@GOTPCREL(%rip), %r8
-; CHECK-NEXT: movl (%r8), %edx
-; CHECK-NEXT: leal 8(,%rbx,8), %eax
+; CHECK-NEXT: movl (%r8), %r9d
+; CHECK-NEXT: leal 8(,%rdx,8), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: leaq 32(%rsi), %r11
-; CHECK-NEXT: leaq 8(,%rbx,8), %rbx
-; CHECK-NEXT: xorl %r14d, %r14d
-; CHECK-NEXT: movq x0@GOTPCREL(%rip), %r15
-; CHECK-NEXT: movq %rsi, %r12
+; CHECK-NEXT: leaq 32(%rsi), %rbx
+; CHECK-NEXT: leaq 8(,%rdx,8), %r14
+; CHECK-NEXT: xorl %r15d, %r15d
+; CHECK-NEXT: movq x0@GOTPCREL(%rip), %r12
+; CHECK-NEXT: movq %rsi, %r13
; CHECK-NEXT: jmp .LBB1_2
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_15: # %for.cond1.for.inc3_crit_edge
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: movl %edx, (%r8)
+; CHECK-NEXT: movl %r9d, (%r8)
; CHECK-NEXT: .LBB1_16: # %for.inc3
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: addq %rbx, %r12
-; CHECK-NEXT: incq %r14
-; CHECK-NEXT: addq %rbx, %r11
+; CHECK-NEXT: addq %r14, %r13
+; CHECK-NEXT: incq %r15
+; CHECK-NEXT: addq %r14, %rbx
; CHECK-NEXT: incl %ecx
; CHECK-NEXT: je .LBB1_17
; CHECK-NEXT: .LBB1_2: # %for.cond1thread-pre-split
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB1_12 Depth 2
; CHECK-NEXT: # Child Loop BB1_14 Depth 2
-; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: testl %r9d, %r9d
; CHECK-NEXT: jns .LBB1_16
; CHECK-NEXT: # %bb.3: # %for.body2.preheader
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: movslq %edx, %r13
-; CHECK-NEXT: testq %r13, %r13
+; CHECK-NEXT: movslq %r9d, %r9
+; CHECK-NEXT: testq %r9, %r9
; CHECK-NEXT: movq $-1, %rbp
-; CHECK-NEXT: cmovnsq %r13, %rbp
-; CHECK-NEXT: subq %r13, %rbp
+; CHECK-NEXT: cmovnsq %r9, %rbp
+; CHECK-NEXT: subq %r9, %rbp
; CHECK-NEXT: incq %rbp
; CHECK-NEXT: cmpq $4, %rbp
; CHECK-NEXT: jb .LBB1_14
@@ -177,20 +177,20 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: # %bb.5: # %vector.memcheck
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: imulq %r14, %rax
-; CHECK-NEXT: leaq (%rsi,%rax), %r10
-; CHECK-NEXT: leaq (%r10,%r13,8), %r9
-; CHECK-NEXT: testq %r13, %r13
-; CHECK-NEXT: movq $-1, %r10
-; CHECK-NEXT: cmovnsq %r13, %r10
-; CHECK-NEXT: cmpq %r15, %r9
+; CHECK-NEXT: imulq %r15, %rax
+; CHECK-NEXT: leaq (%rsi,%rax), %r11
+; CHECK-NEXT: leaq (%r11,%r9,8), %r10
+; CHECK-NEXT: testq %r9, %r9
+; CHECK-NEXT: movq $-1, %r11
+; CHECK-NEXT: cmovnsq %r9, %r11
+; CHECK-NEXT: cmpq %r12, %r10
; CHECK-NEXT: jae .LBB1_7
; CHECK-NEXT: # %bb.6: # %vector.memcheck
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: leaq 8(%rsi), %r9
-; CHECK-NEXT: addq %r9, %rax
-; CHECK-NEXT: leaq (%rax,%r10,8), %rax
-; CHECK-NEXT: cmpq %r15, %rax
+; CHECK-NEXT: leaq 8(%rsi), %r10
+; CHECK-NEXT: addq %r10, %rax
+; CHECK-NEXT: leaq (%rax,%r11,8), %rax
+; CHECK-NEXT: cmpq %r12, %rax
; CHECK-NEXT: ja .LBB1_14
; CHECK-NEXT: .LBB1_7: # %vector.body.preheader
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
@@ -201,50 +201,47 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT: movdqu %xmm0, (%r12,%r13,8)
-; CHECK-NEXT: movdqu %xmm0, 16(%r12,%r13,8)
-; CHECK-NEXT: movl $4, %r10d
+; CHECK-NEXT: movdqu %xmm0, (%r13,%r9,8)
+; CHECK-NEXT: movdqu %xmm0, 16(%r13,%r9,8)
+; CHECK-NEXT: movl $4, %r11d
; CHECK-NEXT: shrq $2, %rax
; CHECK-NEXT: jne .LBB1_11
; CHECK-NEXT: jmp .LBB1_13
; CHECK-NEXT: .LBB1_8: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: xorl %r10d, %r10d
+; CHECK-NEXT: xorl %r11d, %r11d
; CHECK-NEXT: shrq $2, %rax
; CHECK-NEXT: je .LBB1_13
; CHECK-NEXT: .LBB1_11: # %vector.body.preheader.new
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: movq %r11, %rax
; CHECK-NEXT: subq %rdx, %rax
-; CHECK-NEXT: addq %r13, %r10
-; CHECK-NEXT: leaq (%r11,%r10,8), %r10
+; CHECK-NEXT: addq %r9, %r11
+; CHECK-NEXT: leaq (%rbx,%r11,8), %r11
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_12: # %vector.body
; CHECK-NEXT: # Parent Loop BB1_2 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movdqu %xmm0, -32(%r10)
-; CHECK-NEXT: movdqu %xmm0, -16(%r10)
-; CHECK-NEXT: movdqu %xmm0, (%r10)
-; CHECK-NEXT: movdqu %xmm0, 16(%r10)
-; CHECK-NEXT: addq $64, %r10
+; CHECK-NEXT: movdqu %xmm0, -32(%r11)
+; CHECK-NEXT: movdqu %xmm0, -16(%r11)
+; CHECK-NEXT: movdqu %xmm0, (%r11)
+; CHECK-NEXT: movdqu %xmm0, 16(%r11)
+; CHECK-NEXT: addq $64, %r11
; CHECK-NEXT: addq $8, %rax
; CHECK-NEXT: jne .LBB1_12
; CHECK-NEXT: .LBB1_13: # %middle.block
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: addq %rdx, %r13
+; CHECK-NEXT: addq %rdx, %r9
; CHECK-NEXT: cmpq %rdx, %rbp
-; CHECK-NEXT: movq %r13, %rdx
; CHECK-NEXT: je .LBB1_15
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_14: # %for.body2
; CHECK-NEXT: # Parent Loop BB1_2 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movq (%r15), %rax
-; CHECK-NEXT: movq %rax, (%r12,%r13,8)
-; CHECK-NEXT: leaq 1(%r13), %rdx
-; CHECK-NEXT: cmpq $-1, %r13
-; CHECK-NEXT: movq %rdx, %r13
+; CHECK-NEXT: movq (%r12), %rax
+; CHECK-NEXT: movq %rax, (%r13,%r9,8)
+; CHECK-NEXT: incq %r9
; CHECK-NEXT: jl .LBB1_14
; CHECK-NEXT: jmp .LBB1_15
; CHECK-NEXT: .LBB1_17: # %for.cond.for.end5_crit_edge
@@ -252,7 +249,6 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: movq %rdi, (%rax)
; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rax
; CHECK-NEXT: movl $0, (%rax)
-; CHECK-NEXT: .LBB1_18: # %for.end5
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
@@ -265,6 +261,13 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .cfi_restore %rbx
+; CHECK-NEXT: .cfi_restore %r12
+; CHECK-NEXT: .cfi_restore %r13
+; CHECK-NEXT: .cfi_restore %r14
+; CHECK-NEXT: .cfi_restore %r15
+; CHECK-NEXT: .cfi_restore %rbp
+; CHECK-NEXT: .LBB1_18: # %for.end5
; CHECK-NEXT: retq
entry:
%0 = load i32, ptr @x1, align 4
diff --git a/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index deba5a8..18e5490 100644
--- a/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -112,9 +112,9 @@ declare void @_Z3fooPcjPKc(ptr, i32, ptr) #2
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #3
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/dbg-changes-codegen.ll b/llvm/test/CodeGen/X86/dbg-changes-codegen.ll
index fabdbbb..c688895 100644
--- a/llvm/test/CodeGen/X86/dbg-changes-codegen.ll
+++ b/llvm/test/CodeGen/X86/dbg-changes-codegen.ll
@@ -68,8 +68,8 @@ _ZN7Flibble3barEP6Wibble.exit: ; preds = %entry, %if.then.i
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
-attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
!1 = distinct !DISubprogram()
diff --git a/llvm/test/CodeGen/X86/dbg-combine.ll b/llvm/test/CodeGen/X86/dbg-combine.ll
index b3d2213..3ff5a26 100644
--- a/llvm/test/CodeGen/X86/dbg-combine.ll
+++ b/llvm/test/CodeGen/X86/dbg-combine.ll
@@ -63,7 +63,7 @@ declare ptr @llvm.stacksave() #2
; Function Attrs: nounwind
declare void @llvm.stackrestore(ptr) #2
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/debug-loclists-lto.ll b/llvm/test/CodeGen/X86/debug-loclists-lto.ll
index fde8e00..2bd927f 100644
--- a/llvm/test/CodeGen/X86/debug-loclists-lto.ll
+++ b/llvm/test/CodeGen/X86/debug-loclists-lto.ll
@@ -34,8 +34,8 @@ entry:
ret void, !dbg !29
}
-attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind readnone speculatable willreturn }
!llvm.dbg.cu = !{!0, !7}
diff --git a/llvm/test/CodeGen/X86/debugloc-argsize.ll b/llvm/test/CodeGen/X86/debugloc-argsize.ll
index 3cfeb6e..f4527c5 100644
--- a/llvm/test/CodeGen/X86/debugloc-argsize.ll
+++ b/llvm/test/CodeGen/X86/debugloc-argsize.ll
@@ -30,7 +30,7 @@ declare ptr @__cxa_begin_catch(ptr)
declare void @__cxa_end_catch()
-attributes #0 = { optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { optsize }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/early-cfi-sections.ll b/llvm/test/CodeGen/X86/early-cfi-sections.ll
index 3a9e62a..8ab0340 100644
--- a/llvm/test/CodeGen/X86/early-cfi-sections.ll
+++ b/llvm/test/CodeGen/X86/early-cfi-sections.ll
@@ -12,7 +12,7 @@ entry:
ret void, !dbg !8
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/CodeGen/X86/fadd-combines.ll b/llvm/test/CodeGen/X86/fadd-combines.ll
index 2c06c53..a44671c 100644
--- a/llvm/test/CodeGen/X86/fadd-combines.ll
+++ b/llvm/test/CodeGen/X86/fadd-combines.ll
@@ -275,4 +275,4 @@ define <2 x double> @fmul2_negated_vec(<2 x double> %a, <2 x double> %b, <2 x do
ret <2 x double> %sub
}
-attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
index 5afa12c..1bc94b1 100644
--- a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
+++ b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
@@ -65,5 +65,5 @@ entry:
declare i16 @llvm.convert.to.fp16.f64(double)
declare i16 @llvm.convert.to.fp16.f80(x86_fp80)
-attributes #0 = { nounwind readnone "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/fdiv.ll b/llvm/test/CodeGen/X86/fdiv.ll
index 67bad09..859f54e 100644
--- a/llvm/test/CodeGen/X86/fdiv.ll
+++ b/llvm/test/CodeGen/X86/fdiv.ll
@@ -54,7 +54,7 @@ define double @denormal2(double %x) {
; Deleting the negates does not require unsafe-fp-math.
-define float @double_negative(float %x, float %y) #0 {
+define float @double_negative(float %x, float %y) {
; CHECK-LABEL: double_negative:
; CHECK: # %bb.0:
; CHECK-NEXT: divss %xmm1, %xmm0
@@ -65,7 +65,7 @@ define float @double_negative(float %x, float %y) #0 {
ret float %div
}
-define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) #0 {
+define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) {
; CHECK-LABEL: double_negative_vector:
; CHECK: # %bb.0:
; CHECK-NEXT: divps %xmm1, %xmm0
@@ -80,7 +80,7 @@ define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) #0 {
; clang/gcc), due to order of argument evaluation not being well defined. We
; ended up hitting llvm_unreachable in getNegatedExpression when building with
; gcc. Just make sure that we get a deterministic result.
-define float @fdiv_fneg_combine(float %a0, float %a1, float %a2) #0 {
+define float @fdiv_fneg_combine(float %a0, float %a1, float %a2) {
; CHECK-LABEL: fdiv_fneg_combine:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, %xmm3
@@ -99,6 +99,3 @@ define float @fdiv_fneg_combine(float %a0, float %a1, float %a2) #0 {
%div5 = fdiv fast float %mul2, %sub4
ret float %div5
}
-
-attributes #0 = { "unsafe-fp-math"="false" }
-
diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
index 4c16cf9..0c3ec8d 100644
--- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
@@ -1021,7 +1021,7 @@ define <8 x double> @test_v8f64_interp_ninf(<8 x double> %x, <8 x double> %y, <8
; Pattern: (fneg (fma x, y, z)) -> (fma x, -y, -z)
;
-define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) #0 {
+define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_v16f32_fneg_fmadd:
; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm4
@@ -1044,7 +1044,7 @@ define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1,
ret <16 x float> %neg
}
-define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) #0 {
+define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_v8f64_fneg_fmsub:
; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
@@ -1067,7 +1067,7 @@ define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <
ret <8 x double> %neg
}
-define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) #0 {
+define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_v16f32_fneg_fnmadd:
; FMA: # %bb.0:
; FMA-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm4
@@ -1091,7 +1091,7 @@ define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1,
ret <16 x float> %neg1
}
-define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) #0 {
+define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_v8f64_fneg_fnmsub:
; FMA: # %bb.0:
; FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4
@@ -1119,7 +1119,7 @@ define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1,
; Pattern: (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
;
-define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) #0 {
+define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) {
; FMA-LABEL: test_v16f32_fma_x_c1_fmul_x_c2:
; FMA: # %bb.0:
; FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1146,7 +1146,7 @@ define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) #0 {
; Pattern: (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
;
-define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float> %y) #0 {
+define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float> %y) {
; FMA-LABEL: test_v16f32_fma_fmul_x_c1_c2_y:
; FMA: # %bb.0:
; FMA-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm2
@@ -1171,7 +1171,7 @@ define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float
; Pattern: (fneg (fmul x, y)) -> (fnmsub x, y, 0)
-define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0 {
+define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) {
; FMA-LABEL: test_v16f32_fneg_fmul:
; FMA: # %bb.0:
; FMA-NEXT: vxorps %xmm4, %xmm4, %xmm4
@@ -1196,7 +1196,7 @@ define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0
ret <16 x float> %n
}
-define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 {
+define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) {
; FMA-LABEL: test_v8f64_fneg_fmul:
; FMA: # %bb.0:
; FMA-NEXT: vxorpd %xmm4, %xmm4, %xmm4
@@ -1221,7 +1221,7 @@ define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 {
ret <8 x double> %n
}
-define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %y) #0 {
+define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %y) {
; FMA-LABEL: test_v8f64_fneg_fmul_no_nsz:
; FMA: # %bb.0:
; FMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1
@@ -1250,7 +1250,6 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %
ret <8 x double> %n
}
-attributes #0 = { "unsafe-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX512-INFS: {{.*}}
; FMA-INFS: {{.*}}
diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll
index 5ea2964..d60d397 100644
--- a/llvm/test/CodeGen/X86/fold-tied-op.ll
+++ b/llvm/test/CodeGen/X86/fold-tied-op.ll
@@ -158,7 +158,7 @@ if.end: ; preds = %if.else, %if.then
ret i64 undef
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
index bd32430..fc5279d0 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
@@ -100,40 +100,52 @@ entry:
ret i32 %result
}
-; These 2 divs only differ in their exception behavior and will be CSEd. Make
-; sure the nofpexcept flag is not set on the combined node.
+; These 4 divs only differ in their exception behavior. They form two groups;
+; within each group the constrained operations have the same exception behavior
+; and may be CSE'd. Instructions with different exception behavior belong to
+; different groups; they have different chain arguments and cannot be CSE'd.
define void @binop_cse(double %a, double %b, ptr %x, ptr %y) #0 {
entry:
; CHECK-LABEL: name: binop_cse
-; CHECK: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0)
-; CHECK: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16)
-; CHECK: [[MOVSDrm_alt:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16)
-; CHECK: %3:fr64 = DIVSDrm [[MOVSDrm_alt]], %fixed-stack.2, 1, $noreg, 0, $noreg, implicit $mxcsr :: (load (s64) from %fixed-stack.2)
-; CHECK: MOVSDmr killed [[MOV32rm1]], 1, $noreg, 0, $noreg, %3 :: (store (s64) into %ir.x, align 4)
-; CHECK: MOVSDmr killed [[MOV32rm]], 1, $noreg, 0, $noreg, %3 :: (store (s64) into %ir.y, align 4)
+; CHECK: [[Y:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0)
+; CHECK: [[X:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16)
+; CHECK: [[B:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2)
+; CHECK: [[A:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16)
+; CHECK: [[DIV0:%[0-9]+]]:fr64 = DIVSDrr [[A]], [[B]], implicit $mxcsr
+; CHECK: [[DIV1:%[0-9]+]]:fr64 = nofpexcept DIVSDrr [[A]], [[B]], implicit $mxcsr
+; CHECK: MOVSDmr killed [[X]], 1, $noreg, 0, $noreg, [[DIV1]] :: (store (s64) into %ir.x, align 4)
+; CHECK: MOVSDmr killed [[Y]], 1, $noreg, 0, $noreg, [[DIV1]] :: (store (s64) into %ir.y, align 4)
; CHECK: RET 0
%div = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ %div1 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
%div2 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
- store double %div, ptr %x
- store double %div2, ptr %y
+ %div3 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ store double %div2, ptr %x
+ store double %div3, ptr %y
ret void
}
-; These 2 sitofps only differ in their exception behavior and will be CSEd. Make
-; sure the nofpexcept flag is not set on the combined node.
+; These 4 sitofps only differ in their exception behavior. They form two groups;
+; within each group the constrained operations have the same exception behavior
+; and may be CSE'd. Instructions with different exception behavior belong to
+; different groups; they have different chain arguments and cannot be CSE'd.
define void @sitofp_cse(i32 %a, ptr %x, ptr %y) #0 {
entry:
; CHECK-LABEL: name: sitofp_cse
-; CHECK: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 8)
-; CHECK: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1)
-; CHECK: %2:fr64 = CVTSI2SDrm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 16)
-; CHECK: MOVSDmr killed [[MOV32rm1]], 1, $noreg, 0, $noreg, %2 :: (store (s64) into %ir.x, align 4)
-; CHECK: MOVSDmr killed [[MOV32rm]], 1, $noreg, 0, $noreg, %2 :: (store (s64) into %ir.y, align 4)
+; CHECK: [[Y:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 8)
+; CHECK: [[X:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1)
+; CHECK: [[A:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 16)
+; CHECK: [[CVT0:%[0-9]+]]:fr64 = CVTSI2SDrr [[A]]
+; CHECK: [[CVT1:%[0-9]+]]:fr64 = nofpexcept CVTSI2SDrr [[A]]
+; CHECK: MOVSDmr killed [[X]], 1, $noreg, 0, $noreg, [[CVT1]] :: (store (s64) into %ir.x, align 4)
+; CHECK: MOVSDmr killed [[Y]], 1, $noreg, 0, $noreg, [[CVT1]] :: (store (s64) into %ir.y, align 4)
; CHECK: RET 0
%result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ %result1 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
%result2 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
- store double %result, ptr %x
- store double %result2, ptr %y
+ %result3 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ store double %result2, ptr %x
+ store double %result3, ptr %y
ret void
}
diff --git a/llvm/test/CodeGen/X86/fp128-g.ll b/llvm/test/CodeGen/X86/fp128-g.ll
index 58a57d3..d2b956f 100644
--- a/llvm/test/CodeGen/X86/fp128-g.ll
+++ b/llvm/test/CodeGen/X86/fp128-g.ll
@@ -106,8 +106,8 @@ entry:
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
-attributes #0 = { nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "use-soft-float"="false" }
+attributes #1 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
!llvm.dbg.cu = !{!2}
diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll
index f176a29..ef616ca 100644
--- a/llvm/test/CodeGen/X86/fp128-i128.ll
+++ b/llvm/test/CodeGen/X86/fp128-i128.ll
@@ -526,6 +526,6 @@ cleanup: ; preds = %entry, %if.then
}
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/frame-order.ll b/llvm/test/CodeGen/X86/frame-order.ll
index dcbcb48..f410acf 100644
--- a/llvm/test/CodeGen/X86/frame-order.ll
+++ b/llvm/test/CodeGen/X86/frame-order.ll
@@ -74,9 +74,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
declare void @capture(ptr) #2
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #3 = { nounwind }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/X86/fsafdo_test2.ll b/llvm/test/CodeGen/X86/fsafdo_test2.ll
index d83e241..fc4c1e8 100644
--- a/llvm/test/CodeGen/X86/fsafdo_test2.ll
+++ b/llvm/test/CodeGen/X86/fsafdo_test2.ll
@@ -196,10 +196,10 @@ if.end9.3:
}
-attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nofree noinline norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nofree noinline norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
+attributes #3 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll b/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
index 7bb3bf42..4347d62 100644
--- a/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -130,5 +130,5 @@ for.end: ; preds = %for.cond.preheader
; Function Attrs: nounwind
declare i32 @varfunc(ptr nocapture readonly, ...) #0
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll b/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll
index f982196..11a1f39 100644
--- a/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll
@@ -23,7 +23,7 @@ entry:
; CHECK: lock
; CHECK-NEXT: cmpxchg16b
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { nounwind }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/X86/label-annotation.ll b/llvm/test/CodeGen/X86/label-annotation.ll
index 626040c..05e4e87 100644
--- a/llvm/test/CodeGen/X86/label-annotation.ll
+++ b/llvm/test/CodeGen/X86/label-annotation.ll
@@ -77,8 +77,8 @@ entry:
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { inaccessiblememonly noduplicate nounwind }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/label-heapallocsite.ll b/llvm/test/CodeGen/X86/label-heapallocsite.ll
index 31bca25..72834be6c 100644
--- a/llvm/test/CodeGen/X86/label-heapallocsite.ll
+++ b/llvm/test/CodeGen/X86/label-heapallocsite.ll
@@ -98,8 +98,8 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) #2
; CHECK-NEXT: .short [[LABEL5]]-[[LABEL4]]
; CHECK-NEXT: .long 4096
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind readnone speculatable willreturn }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/late-remat-update.mir b/llvm/test/CodeGen/X86/late-remat-update.mir
index 3212312..9108002 100644
--- a/llvm/test/CodeGen/X86/late-remat-update.mir
+++ b/llvm/test/CodeGen/X86/late-remat-update.mir
@@ -39,8 +39,8 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
index 5199b15..96780af0 100644
--- a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
@@ -92,4 +92,4 @@ if.end:
; CHECK: pushl ([[REG3]])
}
-attributes #0 = { nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/lifetime-alias.ll b/llvm/test/CodeGen/X86/lifetime-alias.ll
index 3efaccb..22e350c 100644
--- a/llvm/test/CodeGen/X86/lifetime-alias.ll
+++ b/llvm/test/CodeGen/X86/lifetime-alias.ll
@@ -140,10 +140,10 @@ declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture reado
; Function Attrs: argmemonly nounwind
declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1
-attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
-attributes #2 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #3 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #4 = { nounwind }
attributes #5 = { noreturn nounwind }
attributes #6 = { builtin nounwind }
diff --git a/llvm/test/CodeGen/X86/limit-split-cost.mir b/llvm/test/CodeGen/X86/limit-split-cost.mir
index 5b8bb98..8e4e786 100644
--- a/llvm/test/CodeGen/X86/limit-split-cost.mir
+++ b/llvm/test/CodeGen/X86/limit-split-cost.mir
@@ -53,8 +53,8 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
index 3dba5eb..206d453 100644
--- a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
+++ b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
@@ -41,5 +41,5 @@ if.end: ; preds = %entry
declare <4 x float> @_Z1bv() local_unnamed_addr
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/misched-copy.ll b/llvm/test/CodeGen/X86/misched-copy.ll
index fa6cd15..e3ceddf 100644
--- a/llvm/test/CodeGen/X86/misched-copy.ll
+++ b/llvm/test/CodeGen/X86/misched-copy.ll
@@ -42,7 +42,7 @@ end:
ret i64 %add
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
!0 = !{!"float", !1}
!1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/CodeGen/X86/misched-matmul.ll b/llvm/test/CodeGen/X86/misched-matmul.ll
index a6c489d..9029167 100644
--- a/llvm/test/CodeGen/X86/misched-matmul.ll
+++ b/llvm/test/CodeGen/X86/misched-matmul.ll
@@ -222,4 +222,4 @@ entry:
ret void
}
-attributes #0 = { noinline nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/movpc32-check.ll b/llvm/test/CodeGen/X86/movpc32-check.ll
index e3730d0..4585dcb 100644
--- a/llvm/test/CodeGen/X86/movpc32-check.ll
+++ b/llvm/test/CodeGen/X86/movpc32-check.ll
@@ -12,8 +12,8 @@ entry:
declare void @bar(...) #1
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll b/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll
index ec17b1d..04db25b 100644
--- a/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll
+++ b/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll
@@ -20,5 +20,5 @@ entry:
; CHECK: movq %rax, 7(%rsp)
; CHECK: retq
-attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/nocf_check.ll b/llvm/test/CodeGen/X86/nocf_check.ll
index 7b184ed..742b07d 100644
--- a/llvm/test/CodeGen/X86/nocf_check.ll
+++ b/llvm/test/CodeGen/X86/nocf_check.ll
@@ -66,8 +66,8 @@ bb2:
ret void
}
-attributes #0 = { nocf_check noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nocf_check noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
+attributes #1 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
attributes #2 = { nocf_check }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/X86/pr15705.ll b/llvm/test/CodeGen/X86/pr15705.ll
index 3dd4aab..2de9a34 100644
--- a/llvm/test/CodeGen/X86/pr15705.ll
+++ b/llvm/test/CodeGen/X86/pr15705.ll
@@ -45,4 +45,4 @@ return:
ret i32 %retval.0
}
-attributes #0 = { nounwind readnone ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr18846.ll b/llvm/test/CodeGen/X86/pr18846.ll
index 93a9a5d..4239f46 100644
--- a/llvm/test/CodeGen/X86/pr18846.ll
+++ b/llvm/test/CodeGen/X86/pr18846.ll
@@ -122,7 +122,7 @@ for.body65: ; preds = %for.body29
; Function Attrs: nounwind
declare void @llvm.x86.avx.storeu.ps.256(ptr, <8 x float>) #1
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/X86/pr31045.ll b/llvm/test/CodeGen/X86/pr31045.ll
index 4aa73d7..78ba7cc 100644
--- a/llvm/test/CodeGen/X86/pr31045.ll
+++ b/llvm/test/CodeGen/X86/pr31045.ll
@@ -73,4 +73,4 @@ entry:
ret void
}
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr32610.ll b/llvm/test/CodeGen/X86/pr32610.ll
index dc11ba8..6f3602d 100644
--- a/llvm/test/CodeGen/X86/pr32610.ll
+++ b/llvm/test/CodeGen/X86/pr32610.ll
@@ -50,7 +50,7 @@ entry:
ret void
}
-attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/X86/pr34080-2.ll b/llvm/test/CodeGen/X86/pr34080-2.ll
index de34bfb1..279373a 100644
--- a/llvm/test/CodeGen/X86/pr34080-2.ll
+++ b/llvm/test/CodeGen/X86/pr34080-2.ll
@@ -132,4 +132,4 @@ define void @computeJD(ptr) nounwind {
ret void
}
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i486" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i486" "target-features"="+x87" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr34080.ll b/llvm/test/CodeGen/X86/pr34080.ll
index d07d1aa..3b46bd3 100644
--- a/llvm/test/CodeGen/X86/pr34080.ll
+++ b/llvm/test/CodeGen/X86/pr34080.ll
@@ -162,4 +162,4 @@ entry:
ret void
}
-attributes #0 = { noinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr34629.ll b/llvm/test/CodeGen/X86/pr34629.ll
index eeb61d2..f7747b1 100644
--- a/llvm/test/CodeGen/X86/pr34629.ll
+++ b/llvm/test/CodeGen/X86/pr34629.ll
@@ -38,7 +38,7 @@ if.end: ; preds = %entry, %if.then
ret void
}
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/pr34634.ll b/llvm/test/CodeGen/X86/pr34634.ll
index a374112..980961a 100644
--- a/llvm/test/CodeGen/X86/pr34634.ll
+++ b/llvm/test/CodeGen/X86/pr34634.ll
@@ -54,7 +54,7 @@ entry:
ret i32 0
}
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/pr42727.ll b/llvm/test/CodeGen/X86/pr42727.ll
index cf1fa5a..18e884b 100644
--- a/llvm/test/CodeGen/X86/pr42727.ll
+++ b/llvm/test/CodeGen/X86/pr42727.ll
@@ -29,5 +29,5 @@ entry:
ret void
}
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+avx,+avx2,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+avx,+avx2,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr48064.mir b/llvm/test/CodeGen/X86/pr48064.mir
index eb74edd..d2eaea7 100644
--- a/llvm/test/CodeGen/X86/pr48064.mir
+++ b/llvm/test/CodeGen/X86/pr48064.mir
@@ -185,8 +185,8 @@
; Function Attrs: nounwind
declare void @llvm.x86.seh.ehregnode(ptr) #7
- attributes #0 = { noinline nounwind sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { norecurse sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline nounwind sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
+ attributes #1 = { norecurse sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
attributes #2 = { argmemonly nofree nosync nounwind willreturn }
attributes #3 = { nofree }
attributes #4 = { nofree nosync nounwind willreturn }
diff --git a/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
index 0d5d822..e497575 100644
--- a/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
@@ -178,4 +178,4 @@ bb439: ; preds = %bb222, %bb85
ret void
}
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index dab7a6a..f8d28ae 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -1400,7 +1400,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
ret <16 x float> %div
}
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
-attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
+attributes #0 = { "reciprocal-estimates"="!divf,!vec-divf" }
+attributes #1 = { "reciprocal-estimates"="divf,vec-divf" }
+attributes #2 = { "reciprocal-estimates"="divf:2,vec-divf:2" }
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 77ccaff..7fa13cb 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -1841,8 +1841,8 @@ define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
ret <16 x float> %div
}
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
-attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
-attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" }
+attributes #0 = { "reciprocal-estimates"="!divf,!vec-divf" }
+attributes #1 = { "reciprocal-estimates"="divf,vec-divf" }
+attributes #2 = { "reciprocal-estimates"="divf:2,vec-divf:2" }
+attributes #3 = { "reciprocal-estimates"="divf:0,vec-divf:0" }
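; Note: the "reciprocal-estimates" strings retained above follow LLVM's
; documented mini-syntax: a comma-separated list of estimate kinds, where a
; leading '!' disables that estimate, a trailing ':N' requests N refinement
; steps, and 'all' covers every kind. A minimal sketch of how such an
; attribute attaches to a function (hypothetical name and group number, not
; part of this patch):
define float @recip_sketch(float %x) #9 {
  %r = fdiv fast float 1.0, %x   ; division a backend may lower to a reciprocal estimate
  ret float %r
}
attributes #9 = { "reciprocal-estimates"="divf:1,!vec-divf" }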
diff --git a/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll b/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
index 50422a8..ea1ca51 100644
--- a/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
+++ b/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
@@ -72,7 +72,7 @@ if.end: ; preds = %if.else, %if.then
ret i32 %add
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "use-soft-float"="false" }
attributes #1 = { nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/regparm.ll b/llvm/test/CodeGen/X86/regparm.ll
index 6d6802e..95009b5 100644
--- a/llvm/test/CodeGen/X86/regparm.ll
+++ b/llvm/test/CodeGen/X86/regparm.ll
@@ -38,7 +38,7 @@ entry:
declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1) #1
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/X86/seh-catchpad.ll b/llvm/test/CodeGen/X86/seh-catchpad.ll
index cb85f39..85e465b 100644
--- a/llvm/test/CodeGen/X86/seh-catchpad.ll
+++ b/llvm/test/CodeGen/X86/seh-catchpad.ll
@@ -189,9 +189,9 @@ entry:
; Function Attrs: nounwind
declare i32 @puts(ptr nocapture readonly) #3
-attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
+attributes #2 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
+attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
attributes #4 = { noinline }
attributes #5 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/seh-except-finally.ll b/llvm/test/CodeGen/X86/seh-except-finally.ll
index 539d776..fedb0c4 100644
--- a/llvm/test/CodeGen/X86/seh-except-finally.ll
+++ b/llvm/test/CodeGen/X86/seh-except-finally.ll
@@ -136,10 +136,10 @@ declare ptr @llvm.localaddress() #4
; Function Attrs: nounwind readnone
declare i32 @llvm.eh.typeid.for(ptr) #4
-attributes #0 = { noinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { noinline nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
+attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
+attributes #2 = { noinline nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
+attributes #3 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
attributes #4 = { nounwind readnone }
attributes #5 = { noinline }
attributes #6 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/seh-no-invokes.ll b/llvm/test/CodeGen/X86/seh-no-invokes.ll
index 63e91d3..112031c 100644
--- a/llvm/test/CodeGen/X86/seh-no-invokes.ll
+++ b/llvm/test/CodeGen/X86/seh-no-invokes.ll
@@ -63,8 +63,8 @@ declare i32 @_except_handler3(...)
; Function Attrs: nounwind
declare void @llvm.localescape(...) #3
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/shrinkwrap-hang.ll b/llvm/test/CodeGen/X86/shrinkwrap-hang.ll
index fe42d31..7e98b8a 100644
--- a/llvm/test/CodeGen/X86/shrinkwrap-hang.ll
+++ b/llvm/test/CodeGen/X86/shrinkwrap-hang.ll
@@ -29,4 +29,4 @@ if.end3: ; preds = %if.end
ret void
}
-attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index a260b32..83bfcd7 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1012,15 +1012,15 @@ define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
ret double %sqrt_fast
}
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
+attributes #0 = { "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
+attributes #1 = { "reciprocal-estimates"="sqrt,vec-sqrt" }
attributes #2 = { nounwind readnone }
-attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
-attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
-attributes #5 = { "unsafe-fp-math"="true" "reciprocal-estimates"="all:0" }
-attributes #6 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
+attributes #3 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
+attributes #4 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
+attributes #5 = { "reciprocal-estimates"="all:0" }
+attributes #6 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
-; Attributes without "unsafe-fp-math"="true"
+; Attributes without "unsafe-fp-math" in the original test
; TODO: Merge with previous attributes when this attribute can be deleted.
attributes #7 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" } ; #3
attributes #8 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" } ; #6
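; Note: "denormal-fp-math" takes one or two comma-separated modes; when two
; are given, the first describes denormal results and the second denormal
; inputs, so "preserve-sign,ieee" flushes result denormals to a signed zero
; while reading input denormals as usual. Hypothetical illustration, not
; part of this patch:
;   attributes #10 = { "denormal-fp-math"="preserve-sign,dynamic" }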
diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll
index 8ac86d1..5005752 100644
--- a/llvm/test/CodeGen/X86/sse1.ll
+++ b/llvm/test/CodeGen/X86/sse1.ll
@@ -251,5 +251,5 @@ define <2 x float> @PR31672() #0 {
declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #1
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
+attributes #0 = { nounwind }
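; Note: instruction-level fast-math flags are the modern counterpart of the
; function-wide "unsafe-fp-math" attribute being dropped throughout this
; patch; the relaxation is spelled per operation instead. Hypothetical
; sketch, not part of this patch:
;   %p = fmul reassoc arcp float %a, %b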
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
index 665a84a..d7c9438 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -1912,7 +1912,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
-define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1927,7 +1927,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1941,7 +1941,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double>
ret <2 x double> %2
}
-define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1956,7 +1956,7 @@ define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
-define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
+define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1970,7 +1970,7 @@ define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x doub
ret <4 x double> %2
}
-define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1985,7 +1985,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1999,7 +1999,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2014,7 +2014,7 @@ define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
}
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2028,7 +2028,7 @@ define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float>
ret <8 x float> %2
}
-define double @stack_fold_maxsd(double %a0, double %a1) #0 {
+define double @stack_fold_maxsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_maxsd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2043,7 +2043,7 @@ define double @stack_fold_maxsd(double %a0, double %a1) #0 {
ret double %3
}
-define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_maxsd_commutable(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_maxsd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2058,7 +2058,7 @@ define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
ret double %3
}
-define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxsd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2073,7 +2073,7 @@ define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0
}
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-define float @stack_fold_maxss(float %a0, float %a1) #0 {
+define float @stack_fold_maxss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_maxss:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2088,7 +2088,7 @@ define float @stack_fold_maxss(float %a0, float %a1) #0 {
ret float %3
}
-define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_maxss_commutable(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_maxss_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2103,7 +2103,7 @@ define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
ret float %3
}
-define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2118,7 +2118,7 @@ define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
-define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2133,7 +2133,7 @@ define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2147,7 +2147,7 @@ define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double>
ret <2 x double> %2
}
-define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2162,7 +2162,7 @@ define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) #0
}
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
-define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
+define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2176,7 +2176,7 @@ define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x doub
ret <4 x double> %2
}
-define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2191,7 +2191,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2205,7 +2205,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2220,7 +2220,7 @@ define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
}
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2234,7 +2234,7 @@ define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float>
ret <8 x float> %2
}
-define double @stack_fold_minsd(double %a0, double %a1) #0 {
+define double @stack_fold_minsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_minsd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2249,7 +2249,7 @@ define double @stack_fold_minsd(double %a0, double %a1) #0 {
ret double %3
}
-define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_minsd_commutable(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_minsd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2279,7 +2279,7 @@ define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
}
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-define float @stack_fold_minss(float %a0, float %a1) #0 {
+define float @stack_fold_minss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_minss:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2294,7 +2294,7 @@ define float @stack_fold_minss(float %a0, float %a1) #0 {
ret float %3
}
-define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_minss_commutable(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_minss_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2309,7 +2309,7 @@ define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
ret float %3
}
-define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3632,6 +3632,3 @@ define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
%6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
ret <8 x float> %6
}
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
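; Note: a trailing #N on a define binds the function to the numbered
; attribute group, and the .ll parser rejects references to groups that are
; never defined, so the #0/#1 suffixes on the defines above and the group
; definitions here are deleted together. Hypothetical sketch, not part of
; this patch:
;   define float @f(float %a) #5 {
;     ret float %a
;   }
;   attributes #5 = { nounwind }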
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
index a75cdf9d..43743d5 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
@@ -609,7 +609,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
-define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -695,7 +695,7 @@ define <8 x double> @stack_fold_maxpd_zmm_commutable_kz(<8 x double> %a0, <8 x d
ret <8 x double> %4
}
-define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -781,7 +781,7 @@ define <16 x float> @stack_fold_maxps_zmm_commutable_kz(<16 x float> %a0, <16 x
ret <16 x float> %4
}
-define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -867,7 +867,7 @@ define <8 x double> @stack_fold_minpd_zmm_commutable_kz(<8 x double> %a0, <8 x d
ret <8 x double> %4
}
-define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1157,7 +1157,7 @@ define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
ret <4 x float> %5
}
-define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_orpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1178,7 +1178,7 @@ define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) #0
ret <8 x double> %6
}
-define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_orps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1414,7 +1414,7 @@ define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
ret <4 x float> %5
}
-define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_xorpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1435,7 +1435,7 @@ define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) #0
ret <8 x double> %6
}
-define <16 x float> @stack_fold_xorps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_xorps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_xorps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -2058,5 +2058,4 @@ define <16 x float> @stack_fold_permilpsvar_zmm_maskz(<16 x float> %a0, <16 x i3
ret <16 x float> %4
}
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
index 52d4d8b..b715df8 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
@@ -502,7 +502,7 @@ define <8 x half> @stack_fold_getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, pt
ret <8 x half> %3
}
-define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -517,7 +517,7 @@ define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
}
declare <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
-define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph_zmm_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -532,7 +532,7 @@ define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %
ret <32 x half> %2
}
-define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_maxph_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -552,7 +552,7 @@ define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32
ret <32 x half> %5
}
-define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_maxph_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -573,7 +573,7 @@ define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half>
ret <32 x half> %5
}
-define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_maxph_zmm_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -590,7 +590,7 @@ define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
ret <32 x half> %4
}
-define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_maxph_zmm_kz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -710,7 +710,7 @@ define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0,
ret <32 x half> %4
}
-define half @stack_fold_maxsh(half %a0, half %a1) #0 {
+define half @stack_fold_maxsh(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_maxsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -725,7 +725,7 @@ define half @stack_fold_maxsh(half %a0, half %a1) #0 {
ret half %3
}
-define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 {
+define half @stack_fold_maxsh_commuted(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_maxsh_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -772,7 +772,7 @@ define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 {
ret half %3
}
-define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_maxsh_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -820,7 +820,7 @@ define <8 x half> @stack_fold_maxsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %ma
ret <8 x half> %2
}
-define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_minph_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -835,7 +835,7 @@ define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
}
declare <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
-define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_minph_zmm_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -850,7 +850,7 @@ define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %
ret <32 x half> %2
}
-define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_minph_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -870,7 +870,7 @@ define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32
ret <32 x half> %5
}
-define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_minph_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -891,7 +891,7 @@ define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half>
ret <32 x half> %5
}
-define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_minph_zmm_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -908,7 +908,7 @@ define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
ret <32 x half> %4
}
-define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_minph_zmm_kz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1028,7 +1028,7 @@ define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0,
ret <32 x half> %4
}
-define half @stack_fold_minsh(half %a0, half %a1) #0 {
+define half @stack_fold_minsh(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_minsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1043,7 +1043,7 @@ define half @stack_fold_minsh(half %a0, half %a1) #0 {
ret half %3
}
-define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 {
+define half @stack_fold_minsh_commuted(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_minsh_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1090,7 +1090,7 @@ define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 {
ret half %3
}
-define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_minsh_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2316,5 +2316,4 @@ define <4 x float> @stack_fold_fcmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1,
}
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
index 4fed6bc..cd06f2d 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
@@ -381,7 +381,7 @@ define <16 x half> @stack_fold_getmantph_maskz_ymm(<16 x half> %a0, ptr %mask) {
ret <16 x half> %3
}
-define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -396,7 +396,7 @@ define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 {
}
declare <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half>, <8 x half>) nounwind readnone
-define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
+define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -410,7 +410,7 @@ define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) #
ret <8 x half> %2
}
-define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
+define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -425,7 +425,7 @@ define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
}
declare <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half>, <16 x half>) nounwind readnone
-define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
+define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -439,7 +439,7 @@ define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half>
ret <16 x half> %2
}
-define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_minph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -454,7 +454,7 @@ define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 {
}
declare <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half>, <8 x half>) nounwind readnone
-define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
+define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_minph_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -468,7 +468,7 @@ define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) #
ret <8 x half> %2
}
-define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
+define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_minph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -483,7 +483,7 @@ define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
}
declare <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half>, <16 x half>) nounwind readnone
-define <16 x half> @stack_fold_minph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
+define <16 x half> @stack_fold_minph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_minph_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1471,6 +1471,3 @@ define <8 x float> @stack_fold_fcmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a
ret <8 x float> %3
}
declare <8 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index b370a80..bd56e61 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -457,7 +457,7 @@ define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
ret <4 x float> %2
}
-define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -472,7 +472,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -486,7 +486,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double>
ret <2 x double> %2
}
-define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -501,7 +501,7 @@ define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
-define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
+define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -515,7 +515,7 @@ define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x doub
ret <4 x double> %2
}
-define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -530,7 +530,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -544,7 +544,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -559,7 +559,7 @@ define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
}
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -573,7 +573,7 @@ define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float>
ret <8 x float> %2
}
-define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -588,7 +588,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -602,7 +602,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -617,7 +617,7 @@ define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
}
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -687,7 +687,7 @@ define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) {
ret <8 x float> %2
}
-define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_orpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -708,7 +708,7 @@ define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) #0 {
ret <2 x double> %6
}
-define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_orpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -939,7 +939,7 @@ define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) {
ret <8 x float> %2
}
-define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_xorpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -960,7 +960,7 @@ define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) #0 {
ret <2 x double> %6
}
-define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_xorpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1391,6 +1391,3 @@ declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <
declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
index 306ee31..9bc9a9c 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -1424,7 +1424,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
-define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1439,7 +1439,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1453,7 +1453,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double>
ret <2 x double> %2
}
-define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1468,7 +1468,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1482,7 +1482,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define double @stack_fold_maxsd(double %a0, double %a1) #0 {
+define double @stack_fold_maxsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_maxsd:
; CHECK: # %bb.0:
; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1497,7 +1497,7 @@ define double @stack_fold_maxsd(double %a0, double %a1) #0 {
ret double %3
}
-define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_maxsd_commutable(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_maxsd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1512,7 +1512,7 @@ define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
ret double %3
}
-define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxsd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1527,7 +1527,7 @@ define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0
}
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-define float @stack_fold_maxss(float %a0, float %a1) #0 {
+define float @stack_fold_maxss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_maxss:
; CHECK: # %bb.0:
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1542,7 +1542,7 @@ define float @stack_fold_maxss(float %a0, float %a1) #0 {
ret float %3
}
-define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_maxss_commutable(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_maxss_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1557,7 +1557,7 @@ define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
ret float %3
}
-define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1572,7 +1572,7 @@ define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
-define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1587,7 +1587,7 @@ define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1601,7 +1601,7 @@ define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double>
ret <2 x double> %2
}
-define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1616,7 +1616,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1630,7 +1630,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define double @stack_fold_minsd(double %a0, double %a1) #0 {
+define double @stack_fold_minsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_minsd:
; CHECK: # %bb.0:
; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1645,7 +1645,7 @@ define double @stack_fold_minsd(double %a0, double %a1) #0 {
ret double %3
}
-define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_minsd_commutable(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_minsd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1660,7 +1660,7 @@ define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
ret double %3
}
-define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_minsd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1675,7 +1675,7 @@ define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) #0
}
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-define float @stack_fold_minss(float %a0, float %a1) #0 {
+define float @stack_fold_minss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_minss:
; CHECK: # %bb.0:
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1690,7 +1690,7 @@ define float @stack_fold_minss(float %a0, float %a1) #0 {
ret float %3
}
-define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_minss_commutable(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_minss_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1705,7 +1705,7 @@ define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
ret float %3
}
-define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2490,6 +2490,3 @@ define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-protector-3.ll b/llvm/test/CodeGen/X86/stack-protector-3.ll
index 59784af..8ca6a56 100644
--- a/llvm/test/CodeGen/X86/stack-protector-3.ll
+++ b/llvm/test/CodeGen/X86/stack-protector-3.ll
@@ -118,7 +118,7 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
-attributes #0 = { nounwind sspreq uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind sspreq uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll b/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
index 63390e4..4bc91bf 100644
--- a/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
+++ b/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
@@ -55,6 +55,6 @@ entry:
declare void @f(i32) #1
-attributes #0 = { nounwind sspreq "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind sspreq "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/stack_guard_remat.ll b/llvm/test/CodeGen/X86/stack_guard_remat.ll
index f7a602c..f53fa0b4 100644
--- a/llvm/test/CodeGen/X86/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/X86/stack_guard_remat.ll
@@ -23,4 +23,4 @@ declare void @foo3(ptr)
; Function Attrs: nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/tail-merge-wineh.ll b/llvm/test/CodeGen/X86/tail-merge-wineh.ll
index 00bddc1..a208368 100644
--- a/llvm/test/CodeGen/X86/tail-merge-wineh.ll
+++ b/llvm/test/CodeGen/X86/tail-merge-wineh.ll
@@ -101,5 +101,5 @@ declare x86_stdcallcc void @_CxxThrowException(ptr, ptr)
declare i32 @__CxxFrameHandler3(...)
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { noreturn }
diff --git a/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll b/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll
index 2749ebd..f0d5a16 100644
--- a/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll
@@ -51,6 +51,6 @@ if.end: ; preds = %if.then, %entry
declare void @f(...) #1
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/unused_stackslots.ll b/llvm/test/CodeGen/X86/unused_stackslots.ll
index d909dd4..4d390bd 100644
--- a/llvm/test/CodeGen/X86/unused_stackslots.ll
+++ b/llvm/test/CodeGen/X86/unused_stackslots.ll
@@ -215,8 +215,8 @@ declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64,
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/uwtables.ll b/llvm/test/CodeGen/X86/uwtables.ll
index 1e2e1d9..68a5ff1 100644
--- a/llvm/test/CodeGen/X86/uwtables.ll
+++ b/llvm/test/CodeGen/X86/uwtables.ll
@@ -38,5 +38,5 @@ declare i32 @__gxx_personality_v0(...)
declare void @__cxa_call_unexpected(ptr) local_unnamed_addr
-attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 910dd1e..5954e34 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -5435,7 +5435,7 @@ define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
ret double %r
}
-define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
+define void @PR43609(ptr nocapture %x, <2 x i64> %y) {
; SSE2-LABEL: PR43609:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2,2]
@@ -5643,6 +5643,3 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
store <2 x double> %t23, ptr %t26, align 8
ret void
}
-
-attributes #0 = { "unsafe-fp-math"="true" }
-
diff --git a/llvm/test/CodeGen/X86/vector-sqrt.ll b/llvm/test/CodeGen/X86/vector-sqrt.ll
index b08784a..843f099a 100644
--- a/llvm/test/CodeGen/X86/vector-sqrt.ll
+++ b/llvm/test/CodeGen/X86/vector-sqrt.ll
@@ -63,6 +63,6 @@ entry:
; Function Attrs: nounwind readnone
declare float @sqrtf(float) local_unnamed_addr #1
-attributes #0 = { nounwind readonly uwtable "target-features"="+avx" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "target-features"="+avx2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly uwtable "target-features"="+avx" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "target-features"="+avx2" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/vector-width-store-merge.ll b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
index 50c7b01..9363348 100644
--- a/llvm/test/CodeGen/X86/vector-width-store-merge.ll
+++ b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
@@ -85,8 +85,8 @@ entry:
; Function Attrs: argmemonly nounwind
declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1 immarg) #1
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
!0 = !{i32 1, !"wchar_size", i32 4}
diff --git a/llvm/test/CodeGen/X86/win-cleanuppad.ll b/llvm/test/CodeGen/X86/win-cleanuppad.ll
index e9265a1..59dcccc 100644
--- a/llvm/test/CodeGen/X86/win-cleanuppad.ll
+++ b/llvm/test/CodeGen/X86/win-cleanuppad.ll
@@ -194,6 +194,6 @@ cleanup.outer: ; preds = %invoke.cont.1, %c
; X64-NEXT: .long .Ltmp7@IMGREL
; X64-NEXT: .long -1
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/win32-seh-catchpad.ll b/llvm/test/CodeGen/X86/win32-seh-catchpad.ll
index 832ddca..0f51866 100644
--- a/llvm/test/CodeGen/X86/win32-seh-catchpad.ll
+++ b/llvm/test/CodeGen/X86/win32-seh-catchpad.ll
@@ -220,7 +220,7 @@ declare i32 @_except_handler3(...)
; Function Attrs: nounwind
declare void @llvm.localescape(...) #2
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
attributes #3 = { noinline }
diff --git a/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll b/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll
index 5b1f9b3..5095460 100644
--- a/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll
+++ b/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll
@@ -34,8 +34,8 @@ declare void @f(i32) #0
declare i32 @_except_handler3(...)
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { noinline }
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
index 785c260..d4d4fe3 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
@@ -272,12 +272,12 @@ declare dso_local void @llvm.seh.try.end() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.eh.exceptioncode(token) #3
-attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind willreturn }
attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #5 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #6 = { nounwind }
attributes #7 = { noreturn }
attributes #8 = { noinline }
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
index 6c6e9c3..b0baaac 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
@@ -240,12 +240,12 @@ declare i32 @llvm.eh.exceptioncode(token) #1
declare dso_local void @"?printf@@YAXZZ"(...) #5
-attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
-attributes #2 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #3 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #4 = { nounwind willreturn }
-attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #6 = { nounwind }
attributes #7 = { noinline }
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
index 9e44299..d3da5f8 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
@@ -209,10 +209,10 @@ declare i32 @llvm.eh.exceptioncode(token) #4
; Function Attrs: nounwind
declare void @llvm.localescape(...) #5
-attributes #0 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind willreturn }
-attributes #3 = { noinline "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { noinline "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #4 = { nounwind readnone }
attributes #5 = { nounwind }
attributes #6 = { noinline }
diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
index cb12481..9f888f8 100644
--- a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
+++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -24,7 +24,7 @@ entry:
ret i64 %or
}
-attributes #0 = { minsize nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { minsize nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
; clang -Os -c test2.cpp -emit-llvm -S
@@ -63,7 +63,7 @@ entry:
ret i64 %or
}
-attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
; clang -O2 -c test2.cpp -emit-llvm -S
; Verify that we do not generate a shld instruction when we are not optimizing
@@ -89,7 +89,7 @@ entry:
ret i64 %or
}
-attributes #2= { nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2= { nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
diff --git a/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll
new file mode 100644
index 0000000..f6e941a
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+;; Aggressive instcombine merges the two i8 stores into an i16 store. Check
+;; that the debug location and DIAssignID metadata get merged.
+
+; CHECK: define void @test_i16(i16 %x, ptr %p) !dbg ![[#]] {
+; CHECK-NEXT: store i16 %x, ptr %p, align 1, !dbg ![[DBG:[0-9]+]], !DIAssignID ![[ID:[0-9]+]]
+; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]],
+; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8),
+; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(), ![[#]])
+; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]],
+; CHECK-SAME: !DIExpression(DW_OP_constu, 8, DW_OP_shr, DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 8, 8),
+; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(DW_OP_plus_uconst, 1), ![[#]])
+; CHECK-NEXT: ret void
+
+; CHECK: ![[DBG]] = !DILocation(line: 0, scope: ![[#]])
+
+define void @test_i16(i16 %x, ptr %p) !dbg !5 {
+ %x.0 = trunc i16 %x to i8
+ store i8 %x.0, ptr %p, align 1, !dbg !16, !DIAssignID !17
+ #dbg_assign(i8 %x.0, !9, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !17, ptr %p, !DIExpression(), !18)
+ %shr.1 = lshr i16 %x, 8
+ %x.1 = trunc i16 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1, align 1, !dbg !19, !DIAssignID !20
+ #dbg_assign(i8 %x.1, !9, !DIExpression(DW_OP_LLVM_fragment, 8, 8), !20, ptr %gep.1, !DIExpression(), !18)
+ ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "/app/example.ll", directory: "/")
+!2 = !{i32 7}
+!3 = !{i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test_i16", linkageName: "test_i16", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !{!9}
+!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty16", size: 16, encoding: DW_ATE_unsigned)
+!16 = !DILocation(line: 2, column: 1, scope: !5)
+!17 = distinct !DIAssignID()
+!18 = !DILocation(line: 1, column: 1, scope: !5)
+!19 = !DILocation(line: 6, column: 1, scope: !5)
+!20 = distinct !DIAssignID()
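+
+;; For orientation (a sketch, not exercised by the checks above): on this
+;; little-endian target the two byte stores
+;;   store i8 %x.0, ptr %p       ; low byte  (fragment offset 0, size 8)
+;;   store i8 %x.1, ptr %gep.1   ; high byte (fragment offset 8, size 8)
+;; are folded into the single wider store
+;;   store i16 %x, ptr %p, align 1
+;; and the two #dbg_assign records are rewritten to extract their byte from
+;; the i16 value via DW_OP_LLVM_convert/DW_OP_shr expressions.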
diff --git a/llvm/test/LTO/AArch64/TestInputs/bar.ll b/llvm/test/LTO/AArch64/Inputs/bar.ll
index 7c2a753..7c2a753 100644
--- a/llvm/test/LTO/AArch64/TestInputs/bar.ll
+++ b/llvm/test/LTO/AArch64/Inputs/bar.ll
diff --git a/llvm/test/LTO/AArch64/TestInputs/fiz.ll b/llvm/test/LTO/AArch64/Inputs/fiz.ll
index e578426..e578426 100644
--- a/llvm/test/LTO/AArch64/TestInputs/fiz.ll
+++ b/llvm/test/LTO/AArch64/Inputs/fiz.ll
diff --git a/llvm/test/LTO/AArch64/TestInputs/foo.ll b/llvm/test/LTO/AArch64/Inputs/foo.ll
index 689d938..689d938 100644
--- a/llvm/test/LTO/AArch64/TestInputs/foo.ll
+++ b/llvm/test/LTO/AArch64/Inputs/foo.ll
diff --git a/llvm/test/LTO/AArch64/TestInputs/old.ll b/llvm/test/LTO/AArch64/Inputs/old.ll
index 2b1758b..2b1758b 100644
--- a/llvm/test/LTO/AArch64/TestInputs/old.ll
+++ b/llvm/test/LTO/AArch64/Inputs/old.ll
diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
index aef8907..20254de 100644
--- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
+++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
@@ -2,7 +2,7 @@
;; be mixed.
;;
; RUN: llvm-as %s -o %t1.bc
-; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc
+; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc
; RUN: llvm-lto -exported-symbol main \
; RUN: -exported-symbol foo_on \
; RUN: -filetype=obj \
diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll
index df6276f..331e481 100644
--- a/llvm/test/LTO/AArch64/link-sign-return-address.ll
+++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll
@@ -2,10 +2,10 @@
;; be mixed.
;
; RUN: llvm-as %s -o %t1.bc
-; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc
-; RUN: llvm-as %p/TestInputs/fiz.ll -o %t3.bc
-; RUN: llvm-as %p/TestInputs/bar.ll -o %t4.bc
-; RUN: llvm-as %p/TestInputs/old.ll -o %t5.bc
+; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc
+; RUN: llvm-as %p/Inputs/fiz.ll -o %t3.bc
+; RUN: llvm-as %p/Inputs/bar.ll -o %t4.bc
+; RUN: llvm-as %p/Inputs/old.ll -o %t5.bc
; RUN: llvm-lto -exported-symbol main \
; RUN: -exported-symbol foo_on \
; RUN: -exported-symbol foo_off \
diff --git a/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s b/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s
index cf8d21658..15efbee 100644
--- a/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s
@@ -16,7 +16,7 @@ fmmla v0.4s, v1.4s, v2.4s
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
fmmla v0.8h, v1.8h, v2.8h
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction requires: f16mm
// CHECK-NEXT: fmmla v0.8h, v1.8h, v2.8h
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SME2p3/luti6-diagnostics.s b/llvm/test/MC/AArch64/SME2p3/luti6-diagnostics.s
new file mode 100644
index 0000000..c25ff66
--- /dev/null
+++ b/llvm/test/MC/AArch64/SME2p3/luti6-diagnostics.s
@@ -0,0 +1,191 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 2>&1 < %s | FileCheck %s
+
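+// Each case below expects exactly one diagnostic: [[@LINE-1]] pins the error
+// to the offending instruction's line, CHECK-NEXT matches the echoed
+// instruction, and the trailing CHECK-NOT guards against extra diagnostics.
+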
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+luti6 z0.h, zt0, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 z0.h, zt0, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z0.s, zt0, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z0.s, zt0, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z0.d, zt0, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z0.d, zt0, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 z0.b, zt0, z1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, zt0, z1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 z0.b, zt0, z1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, zt0, z1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid vectors/mismatched registers/invalid index
+
+luti6 { z0.h - z5.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.h - z5.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: luti6 { z0.b - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Wrong striding/registers/index
+
+luti6 { z0.h, z4.h, z8.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: registers must have the same sequential stride
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z1.h, z2.h, z3.h, z4.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types
+// CHECK-NEXT: luti6 { z1.h, z2.h, z3.h, z4.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: luti6 { z0.b, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid registers
+
+luti6 { z0.b - z5.b }, zt0, { z2 - z4 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.b - z5.b }, zt0, { z2 - z4 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z3.b }, zt0, { z1 - z1 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z1 - z1 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z5.b }, zt0, { z7 - z11 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.b - z5.b }, zt0, { z7 - z11 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z3.b }, zt1, { z1 - z3 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid lookup table, expected zt0
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt1, { z1 - z3 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z4.b, z8.b, z12.b, z16.b}, zt0, { z2 - z5 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 { z4.b, z8.b, z12.b, z16.b}, zt0, { z2 - z5 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z17.b, z21.b, z25.b, z29.b}, zt0, { z2 - z5 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 { z17.b, z21.b, z25.b, z29.b}, zt0, { z2 - z5 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Wrong striding/registers
+
+luti6 { z1.b, z5.b, z9.b, z14.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: registers must have the same sequential stride
+// CHECK-NEXT: luti6 { z1.b, z5.b, z9.b, z14.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z1.b, z2.b, z3.b, z4.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types
+// CHECK-NEXT: luti6 { z1.b, z2.b, z3.b, z4.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z20.b, z24.b, z28.b, z32.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector register expected
+// CHECK-NEXT: luti6 { z20.b, z24.b, z28.b, z32.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z1.h, z5.h, z9.h, z13.h }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 { z1.h, z5.h, z9.h, z13.h }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SME2p3/luti6.s b/llvm/test/MC/AArch64/SME2p3/luti6.s
new file mode 100644
index 0000000..7a7872f
--- /dev/null
+++ b/llvm/test/MC/AArch64/SME2p3/luti6.s
@@ -0,0 +1,472 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sme2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sme2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
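+// (The two sed commands strip the .text directive and everything up to the
+// "encoding: " annotation, leaving only the raw byte lists, which are then
+// fed back through the disassembler.)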
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (single)
+
+luti6 z0.b, zt0, z0
+// CHECK-INST: luti6 z0.b, zt0, z0
+// CHECK-ENCODING: encoding: [0x00,0x40,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c84000 <unknown>
+
+luti6 z31.b, zt0, z0
+// CHECK-INST: luti6 z31.b, zt0, z0
+// CHECK-ENCODING: encoding: [0x1f,0x40,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c8401f <unknown>
+
+luti6 z0.b, zt0, z31
+// CHECK-INST: luti6 z0.b, zt0, z31
+// CHECK-ENCODING: encoding: [0xe0,0x43,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c843e0 <unknown>
+
+luti6 z31.b, zt0, z31
+// CHECK-INST: luti6 z31.b, zt0, z31
+// CHECK-ENCODING: encoding: [0xff,0x43,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c843ff <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (16-bit) - consecutive
+
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x00,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f400 <unknown>
+
+luti6 { z8.h - z11.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z8.h - z11.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x08,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f408 <unknown>
+
+luti6 { z20.h - z23.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z20.h - z23.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x14,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f414 <unknown>
+
+luti6 { z28.h - z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z28.h - z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x1c,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f41c <unknown>
+
+luti6 { z0.h - z3.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z0.h - z3.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe0,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7e0 <unknown>
+
+luti6 { z8.h - z11.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z8.h - z11.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe8,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7e8 <unknown>
+
+luti6 { z20.h - z23.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z20.h - z23.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf4,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7f4 <unknown>
+
+luti6 { z28.h - z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z28.h - z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xfc,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7fc <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (16-bit) - strided
+
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x00,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc00 <unknown>
+
+luti6 { z1.h, z5.h, z9.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z1.h, z5.h, z9.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x01,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc01 <unknown>
+
+luti6 { z2.h, z6.h, z10.h, z14.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z2.h, z6.h, z10.h, z14.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x02,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc02 <unknown>
+
+luti6 { z3.h, z7.h, z11.h, z15.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z3.h, z7.h, z11.h, z15.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x03,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc03 <unknown>
+
+luti6 { z16.h, z20.h, z24.h, z28.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z16.h, z20.h, z24.h, z28.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x10,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc10 <unknown>
+
+luti6 { z17.h, z21.h, z25.h, z29.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z17.h, z21.h, z25.h, z29.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x11,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc11 <unknown>
+
+luti6 { z18.h, z22.h, z26.h, z30.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z18.h, z22.h, z26.h, z30.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x12,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc12 <unknown>
+
+luti6 { z19.h, z23.h, z27.h, z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z19.h, z23.h, z27.h, z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x13,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc13 <unknown>
+
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z0.h, z4.h, z8.h, z12.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe0,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe0 <unknown>
+
+luti6 { z1.h, z5.h, z9.h, z13.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z1.h, z5.h, z9.h, z13.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe1,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe1 <unknown>
+
+luti6 { z2.h, z6.h, z10.h, z14.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z2.h, z6.h, z10.h, z14.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe2,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe2 <unknown>
+
+luti6 { z3.h, z7.h, z11.h, z15.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z3.h, z7.h, z11.h, z15.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe3,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe3 <unknown>
+
+luti6 { z16.h, z20.h, z24.h, z28.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z16.h, z20.h, z24.h, z28.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf0,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff0 <unknown>
+
+luti6 { z17.h, z21.h, z25.h, z29.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z17.h, z21.h, z25.h, z29.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf1,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff1 <unknown>
+
+luti6 { z18.h, z22.h, z26.h, z30.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z18.h, z22.h, z26.h, z30.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf2,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff2 <unknown>
+
+luti6 { z19.h, z23.h, z27.h, z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z19.h, z23.h, z27.h, z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf3,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff3 <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (8-bit) - consecutive
+
+luti6 { z8.b - z11.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x08,0x00,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0008 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x14,0x00,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0014 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x1c,0x00,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a001c <unknown>
+
+luti6 { z0.b - z3.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z0.b - z3.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x00,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0100 <unknown>
+
+luti6 { z8.b - z11.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x08,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0108 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x14,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0114 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x1c,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a011c <unknown>
+
+luti6 { z0.b - z3.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z0.b - z3.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x80,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0280 <unknown>
+
+luti6 { z8.b - z11.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x88,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0288 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x94,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0294 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x9c,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a029c <unknown>
+
+luti6 { z0.b - z3.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z0.b - z3.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x80,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0380 <unknown>
+
+luti6 { z8.b - z11.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x88,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0388 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x94,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0394 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x9c,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a039c <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (8-bit) - strided
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x01,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0001 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x02,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0002 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x03,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0003 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x10,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0010 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x11,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0011 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x12,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0012 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x13,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0013 <unknown>
+
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x00,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0100 <unknown>
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x01,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0101 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x02,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0102 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x03,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0103 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x10,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0110 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x11,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0111 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x12,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0112 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x13,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0113 <unknown>
+
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x80,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0280 <unknown>
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x81,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0281 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x82,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0282 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x83,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0283 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x90,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0290 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x91,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0291 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x92,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0292 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x93,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0293 <unknown>
+
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x80,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0380 <unknown>
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x81,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0381 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x82,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0382 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x83,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0383 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x90,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0390 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x91,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0391 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x92,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0392 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x93,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0393 <unknown>
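+
+// Note: no movprfx-prefixed cases appear above because every luti6 variant is
+// incompatible with movprfx; that behaviour is covered in luti6-diagnostics.s.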
diff --git a/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s b/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s
index 409c2c5..4695b05 100644
--- a/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s
@@ -5,11 +5,6 @@ bfmmla z0.s, z1.s, z2.h
// CHECK-NEXT: bfmmla z0.s, z1.s, z2.h
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-bfmmla z0.h, z1.h, z2.h
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
-// CHECK-NEXT: bfmmla z0.h, z1.h, z2.h
-// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-
bfmmla z0.s, z1.h, z2.s
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
// CHECK-NEXT: bfmmla z0.s, z1.h, z2.s
diff --git a/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s b/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s
index 344abb1..42c6ae7 100644
--- a/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s
@@ -29,7 +29,7 @@ sdot z0.s, z0.h, z0.h[-1]
// Invalid vector suffix
sdot z0.h, z0.s, z0.s
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
// CHECK-NEXT: sdot z0.h, z0.s, z0.s
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s b/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s
index 0debddf..aa2ec7f 100644
--- a/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s
@@ -29,11 +29,11 @@ udot z0.s, z0.h, z0.h[-1]
// Invalid vector suffix
udot z0.h, z0.s, z0.s
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
// CHECK-NEXT: udot z0.h, z0.s, z0.s
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
udot z0.h, z0.s, z0.s[1]
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
// CHECK-NEXT: udot z0.h, z0.s, z0.s[1]
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p2/fmmla-diagnostics.s b/llvm/test/MC/AArch64/SVE2p2/fmmla-diagnostics.s
new file mode 100644
index 0000000..e88fcce
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p2/fmmla-diagnostics.s
@@ -0,0 +1,15 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2,+f16mm 2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+fmmla z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: fmmla z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0.d, p0/z, z6.d
+fmmla z0.h, z2.h, z3.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx
+// CHECK-NEXT: fmmla z0.h, z2.h, z3.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p2/fmmla.s b/llvm/test/MC/AArch64/SVE2p2/fmmla.s
new file mode 100644
index 0000000..19929a9
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p2/fmmla.s
@@ -0,0 +1,45 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2,+f16mm < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2,+f16mm < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p2,+f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2,+f16mm < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p2,-f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2,+f16mm < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p2,+f16mm -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fmmla z0.h, z0.h, z0.h
+// CHECK-INST: fmmla z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xe0,0xa0,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64a0e000 <unknown>
+
+fmmla z10.h, z10.h, z10.h
+// CHECK-INST: fmmla z10.h, z10.h, z10.h
+// CHECK-ENCODING: encoding: [0x4a,0xe1,0xaa,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64aae14a <unknown>
+
+fmmla z21.h, z21.h, z21.h
+// CHECK-INST: fmmla z21.h, z21.h, z21.h
+// CHECK-ENCODING: encoding: [0xb5,0xe2,0xb5,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64b5e2b5 <unknown>
+
+fmmla z31.h, z31.h, z31.h
+// CHECK-INST: fmmla z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xe3,0xbf,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64bfe3ff <unknown>
+
+movprfx z0, z7
+fmmla z0.h, z1.h, z2.h
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: fmmla z0.h, z1.h, z2.h
+// CHECK-ENCODING: encoding: [0x20,0xe0,0xa2,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64a2e020 <unknown>
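+
+// Note: fmmla only accepts the unpredicated movprfx shown above; a predicated
+// movprfx is rejected, as covered in fmmla-diagnostics.s.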
diff --git a/llvm/test/MC/AArch64/SVE2p3/arithmetic-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/arithmetic-diagnostics.s
new file mode 100644
index 0000000..2c0cf07
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/arithmetic-diagnostics.s
@@ -0,0 +1,147 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Test addqp
+
+addqp z0.h, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.h, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addqp z0.s, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.s, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addqp z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addqp z0.b, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.b, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test addsubp
+
+addsubp z0.h, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.h, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addsubp z0.s, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.s, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addsubp z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addsubp z0.b, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.b, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test sabal
+
+sabal z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sabal z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sabal z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sabal z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test uabal
+
+uabal z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uabal z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uabal z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uabal z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test subp
+
+subp z0.h, p0/m, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.h, p0/m, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+subp z0.s, p0/m, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.s, p0/m, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+subp z0.d, p0/m, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.d, p0/m, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+subp z0.b, p0/m, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.b, p0/m, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Predicate not in restricted predicate range
+
+subp z0.h, p8/m, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix)
+// CHECK-NEXT: subp z0.h, p8/m, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Operand must match destination register
+
+subp z0.b, p0/m, z1.b, z2.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must match destination register
+// CHECK-NEXT: subp z0.b, p0/m, z1.b, z2.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+addqp z0.b, z1.b, z2.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: addqp z0.b, z1.b, z2.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+addsubp z0.b, z1.b, z2.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: addsubp z0.b, z1.b, z2.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/arithmetic.s b/llvm/test/MC/AArch64/SVE2p3/arithmetic.s
new file mode 100644
index 0000000..12df18d
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/arithmetic.s
@@ -0,0 +1,275 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+addqp z0.b, z0.b, z0.b
+// CHECK-INST: addqp z0.b, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x78,0x20,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04207800 <unknown>
+
+addqp z31.b, z31.b, z31.b
+// CHECK-INST: addqp z31.b, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x7b,0x3f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 043f7bff <unknown>
+
+addqp z0.h, z0.h, z0.h
+// CHECK-INST: addqp z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x78,0x60,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04607800 <unknown>
+
+addqp z31.h, z31.h, z31.h
+// CHECK-INST: addqp z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x7b,0x7f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 047f7bff <unknown>
+
+addqp z0.s, z0.s, z0.s
+// CHECK-INST: addqp z0.s, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x78,0xa0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04a07800 <unknown>
+
+addqp z31.s, z31.s, z31.s
+// CHECK-INST: addqp z31.s, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x7b,0xbf,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04bf7bff <unknown>
+
+addqp z0.d, z0.d, z0.d
+// CHECK-INST: addqp z0.d, z0.d, z0.d
+// CHECK-ENCODING: encoding: [0x00,0x78,0xe0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04e07800 <unknown>
+
+addqp z31.d, z31.d, z31.d
+// CHECK-INST: addqp z31.d, z31.d, z31.d
+// CHECK-ENCODING: encoding: [0xff,0x7b,0xff,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04ff7bff <unknown>
+
+// --------------------------------------------------------------------------//
+// Test addsubp
+
+addsubp z0.b, z0.b, z0.b
+// CHECK-INST: addsubp z0.b, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x7c,0x20,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04207c00 <unknown>
+
+addsubp z31.b, z31.b, z31.b
+// CHECK-INST: addsubp z31.b, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x7f,0x3f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 043f7fff <unknown>
+
+addsubp z0.h, z0.h, z0.h
+// CHECK-INST: addsubp z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x7c,0x60,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04607c00 <unknown>
+
+addsubp z31.h, z31.h, z31.h
+// CHECK-INST: addsubp z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x7f,0x7f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 047f7fff <unknown>
+
+addsubp z0.s, z0.s, z0.s
+// CHECK-INST: addsubp z0.s, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x7c,0xa0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04a07c00 <unknown>
+
+addsubp z31.s, z31.s, z31.s
+// CHECK-INST: addsubp z31.s, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x7f,0xbf,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04bf7fff <unknown>
+
+addsubp z0.d, z0.d, z0.d
+// CHECK-INST: addsubp z0.d, z0.d, z0.d
+// CHECK-ENCODING: encoding: [0x00,0x7c,0xe0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04e07c00 <unknown>
+
+addsubp z31.d, z31.d, z31.d
+// CHECK-INST: addsubp z31.d, z31.d, z31.d
+// CHECK-ENCODING: encoding: [0xff,0x7f,0xff,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04ff7fff <unknown>
+
+// --------------------------------------------------------------------------//
+// Test sabal
+
+sabal z0.h, z0.b, z0.b
+// CHECK-INST: sabal z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0xd4,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4440d400 <unknown>
+
+sabal z31.h, z31.b, z31.b
+// CHECK-INST: sabal z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0xd7,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445fd7ff <unknown>
+
+sabal z0.s, z0.h, z0.h
+// CHECK-INST: sabal z0.s, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xd4,0x80,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4480d400 <unknown>
+
+sabal z31.s, z31.h, z31.h
+// CHECK-INST: sabal z31.s, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xd7,0x9f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 449fd7ff <unknown>
+
+sabal z0.d, z0.s, z0.s
+// CHECK-INST: sabal z0.d, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0xd4,0xc0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44c0d400 <unknown>
+
+sabal z31.d, z31.s, z31.s
+// CHECK-INST: sabal z31.d, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0xd7,0xdf,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44dfd7ff <unknown>
+
+movprfx z0, z7
+sabal z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: sabal z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0xd4,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4442d420 <unknown>
+
+// --------------------------------------------------------------------------//
+// Test uabal
+
+uabal z0.h, z0.b, z0.b
+// CHECK-INST: uabal z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0xdc,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4440dc00 <unknown>
+
+uabal z31.h, z31.b, z31.b
+// CHECK-INST: uabal z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0xdf,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445fdfff <unknown>
+
+uabal z0.s, z0.h, z0.h
+// CHECK-INST: uabal z0.s, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xdc,0x80,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4480dc00 <unknown>
+
+uabal z31.s, z31.h, z31.h
+// CHECK-INST: uabal z31.s, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xdf,0x9f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 449fdfff <unknown>
+
+uabal z0.d, z0.s, z0.s
+// CHECK-INST: uabal z0.d, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0xdc,0xc0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44c0dc00 <unknown>
+
+uabal z31.d, z31.s, z31.s
+// CHECK-INST: uabal z31.d, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0xdf,0xdf,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44dfdfff <unknown>
+
+movprfx z0, z7
+uabal z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: uabal z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0xdc,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4442dc20 <unknown>
+
+// --------------------------------------------------------------------------//
+// Test subp
+
+subp z0.b, p0/m, z0.b, z0.b
+// CHECK-INST: subp z0.b, p0/m, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0xa0,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410a000 <unknown>
+
+subp z31.b, p7/m, z31.b, z31.b
+// CHECK-INST: subp z31.b, p7/m, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0xbf,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410bfff <unknown>
+
+subp z0.h, p0/m, z0.h, z0.h
+// CHECK-INST: subp z0.h, p0/m, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xa0,0x50,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4450a000 <unknown>
+
+subp z31.h, p7/m, z31.h, z31.h
+// CHECK-INST: subp z31.h, p7/m, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xbf,0x50,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4450bfff <unknown>
+
+subp z0.s, p0/m, z0.s, z0.s
+// CHECK-INST: subp z0.s, p0/m, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0xa0,0x90,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4490a000 <unknown>
+
+subp z31.s, p7/m, z31.s, z31.s
+// CHECK-INST: subp z31.s, p7/m, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0xbf,0x90,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4490bfff <unknown>
+
+subp z0.d, p0/m, z0.d, z0.d
+// CHECK-INST: subp z0.d, p0/m, z0.d, z0.d
+// CHECK-ENCODING: encoding: [0x00,0xa0,0xd0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44d0a000 <unknown>
+
+subp z31.d, p7/m, z31.d, z31.d
+// CHECK-INST: subp z31.d, p7/m, z31.d, z31.d
+// CHECK-ENCODING: encoding: [0xff,0xbf,0xd0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44d0bfff <unknown>
+
+movprfx z0.b, p0/m, z7.b
+subp z0.b, p0/m, z0.b, z1.b
+// CHECK-INST: movprfx z0.b, p0/m, z7.b
+// CHECK-INST: subp z0.b, p0/m, z0.b, z1.b
+// CHECK-ENCODING: encoding: [0x20,0xa0,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410a020 <unknown>
+
+movprfx z0, z7
+subp z0.b, p0/m, z0.b, z1.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: subp z0.b, p0/m, z0.b, z1.b
+// CHECK-ENCODING: encoding: [0x20,0xa0,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410a020 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/bfmmla-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/bfmmla-diagnostics.s
new file mode 100644
index 0000000..28ec78d
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/bfmmla-diagnostics.s
@@ -0,0 +1,24 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve-b16mm 2>&1 < %s| FileCheck %s
+
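+// In each case below, [[@LINE-1]] on the CHECK line expands to the line
+// number of the offending instruction, pinning the expected error to its
+// source line; CHECK-NEXT matches the echoed statement, and the trailing
+// CHECK-NOT guards against a stray second diagnostic before the next one.
+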
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+bfmmla z0.h, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.h, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfmmla z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfmmla z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/bfmmla.s b/llvm/test/MC/AArch64/SVE2p3/bfmmla.s
new file mode 100644
index 0000000..77440ee
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/bfmmla.s
@@ -0,0 +1,49 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve-b16mm < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve-b16mm < %s \
+// RUN: | llvm-objdump -d --mattr=+sve-b16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve-b16mm < %s \
+// RUN: | llvm-objdump -d --mattr=-sve-b16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve-b16mm < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve-b16mm -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
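+// The sed pipeline keeps only the "encoding: [...]" byte lists printed by
+// -show-encoding (dropping the .text directive line), feeds them back through
+// the disassembler, and checks the re-printed output against the same
+// CHECK-ENCODING/CHECK-INST lines.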
+
+bfmmla z0.h, z0.h, z0.h
+// CHECK-INST: bfmmla z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xe0,0xe0,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64e0e000 <unknown>
+
+bfmmla z10.h, z10.h, z10.h
+// CHECK-INST: bfmmla z10.h, z10.h, z10.h
+// CHECK-ENCODING: encoding: [0x4a,0xe1,0xea,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64eae14a <unknown>
+
+bfmmla z21.h, z21.h, z21.h
+// CHECK-INST: bfmmla z21.h, z21.h, z21.h
+// CHECK-ENCODING: encoding: [0xb5,0xe2,0xf5,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64f5e2b5 <unknown>
+
+bfmmla z31.h, z31.h, z31.h
+// CHECK-INST: bfmmla z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xe3,0xff,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64ffe3ff <unknown>
+
+movprfx z0, z7
+bfmmla z0.h, z1.h, z2.h
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: bfmmla z0.h, z1.h, z2.h
+// CHECK-ENCODING: encoding: [0x20,0xe0,0xe2,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64e2e020 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/cvt-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/cvt-diagnostics.s
new file mode 100644
index 0000000..68a50ab
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/cvt-diagnostics.s
@@ -0,0 +1,193 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+fcvtzsn z0.b, { z0.b, z1.b }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzsn z0.b, { z0.b, z1.b }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzsn z0.h, { z0.h, z1.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzsn z0.h, { z0.h, z1.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzsn z0.s, { z0.s, z1.s }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzsn z0.s, { z0.s, z1.s }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzsn z0.b, { z1.h, z2.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: fcvtzsn z0.b, { z1.h, z2.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+fcvtzsn z0.b, { z2.h, z3.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: fcvtzsn z0.b, { z2.h, z3.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+fcvtzun z0.b, { z0.b, z1.b }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzun z0.b, { z0.b, z1.b }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzun z0.h, { z0.h, z1.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzun z0.h, { z0.h, z1.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzun z0.s, { z0.s, z1.s }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzun z0.s, { z0.s, z1.s }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzun z0.b, { z1.h, z2.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: fcvtzun z0.b, { z1.h, z2.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+fcvtzun z0.b, { z2.h, z3.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: fcvtzun z0.b, { z2.h, z3.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+scvtf z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtf z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtf z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtf z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+scvtf z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: scvtf z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+scvtflt z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtflt z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtflt z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtflt z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+scvtflt z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: scvtflt z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+ucvtf z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtf z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtf z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtf z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+ucvtf z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: ucvtf z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+ucvtflt z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtflt z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtflt z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtflt z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+ucvtflt z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: ucvtflt z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/cvt.s b/llvm/test/MC/AArch64/SVE2p3/cvt.s
new file mode 100644
index 0000000..da9c463
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/cvt.s
@@ -0,0 +1,321 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// -----------------------------------------------------------------------
+// Floating-point convert, narrow and interleave to signed integer, rounding toward zero
+
+fcvtzsn z0.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzsn z0.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x00,0x30,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d3000 <unknown>
+
+fcvtzsn z31.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzsn z31.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x1f,0x30,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d301f <unknown>
+
+fcvtzsn z0.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzsn z0.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xc0,0x33,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d33c0 <unknown>
+
+fcvtzsn z31.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzsn z31.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xdf,0x33,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d33df <unknown>
+
+fcvtzsn z0.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzsn z0.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x00,0x30,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d3000 <unknown>
+
+fcvtzsn z31.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzsn z31.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x1f,0x30,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d301f <unknown>
+
+fcvtzsn z0.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzsn z0.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xc0,0x33,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d33c0 <unknown>
+
+fcvtzsn z31.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzsn z31.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xdf,0x33,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d33df <unknown>
+
+fcvtzsn z0.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzsn z0.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x00,0x30,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd3000 <unknown>
+
+fcvtzsn z31.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzsn z31.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x1f,0x30,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd301f <unknown>
+
+fcvtzsn z0.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzsn z0.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xc0,0x33,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd33c0 <unknown>
+
+fcvtzsn z31.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzsn z31.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xdf,0x33,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd33df <unknown>
+
+// -----------------------------------------------------------------------
+// Floating-point convert, narrow and interleave to unsigned integer, rounding toward zero
+
+fcvtzun z0.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzun z0.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x00,0x34,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d3400 <unknown>
+
+fcvtzun z31.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzun z31.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x1f,0x34,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d341f <unknown>
+
+fcvtzun z0.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzun z0.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xc0,0x37,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d37c0 <unknown>
+
+fcvtzun z31.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzun z31.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xdf,0x37,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d37df <unknown>
+
+fcvtzun z0.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzun z0.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x00,0x34,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d3400 <unknown>
+
+fcvtzun z31.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzun z31.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x1f,0x34,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d341f <unknown>
+
+fcvtzun z0.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzun z0.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xc0,0x37,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d37c0 <unknown>
+
+fcvtzun z31.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzun z31.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xdf,0x37,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d37df <unknown>
+
+fcvtzun z0.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzun z0.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x00,0x34,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd3400 <unknown>
+
+fcvtzun z31.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzun z31.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x1f,0x34,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd341f <unknown>
+
+fcvtzun z0.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzun z0.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xc0,0x37,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd37c0 <unknown>
+
+fcvtzun z31.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzun z31.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xdf,0x37,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd37df <unknown>
+
+// -----------------------------------------------------------------------
+// Signed integer convert to floating-point (bottom, unpredicated)
+
+scvtf z0.h, z0.b
+// CHECK-INST: scvtf z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x30,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3000 <unknown>
+
+scvtf z31.h, z31.b
+// CHECK-INST: scvtf z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x33,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c33ff <unknown>
+
+scvtf z0.s, z0.h
+// CHECK-INST: scvtf z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x30,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3000 <unknown>
+
+scvtf z31.s, z31.h
+// CHECK-INST: scvtf z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x33,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c33ff <unknown>
+
+scvtf z0.d, z0.s
+// CHECK-INST: scvtf z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x30,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3000 <unknown>
+
+scvtf z31.d, z31.s
+// CHECK-INST: scvtf z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x33,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc33ff <unknown>
+
+// -----------------------------------------------------------------------
+// Signed integer convert to floating-point (top, unpredicated)
+
+scvtflt z0.h, z0.b
+// CHECK-INST: scvtflt z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x38,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3800 <unknown>
+
+scvtflt z31.h, z31.b
+// CHECK-INST: scvtflt z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x3b,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3bff <unknown>
+
+scvtflt z0.s, z0.h
+// CHECK-INST: scvtflt z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x38,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3800 <unknown>
+
+scvtflt z31.s, z31.h
+// CHECK-INST: scvtflt z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x3b,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3bff <unknown>
+
+scvtflt z0.d, z0.s
+// CHECK-INST: scvtflt z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x38,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3800 <unknown>
+
+scvtflt z31.d, z31.s
+// CHECK-INST: scvtflt z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x3b,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3bff <unknown>
+
+// -----------------------------------------------------------------------
+// Unsigned integer convert to floating-point (bottom, unpredicated)
+
+ucvtf z0.h, z0.b
+// CHECK-INST: ucvtf z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x34,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3400 <unknown>
+
+ucvtf z31.h, z31.b
+// CHECK-INST: ucvtf z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x37,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c37ff <unknown>
+
+ucvtf z0.s, z0.h
+// CHECK-INST: ucvtf z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x34,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3400 <unknown>
+
+ucvtf z31.s, z31.h
+// CHECK-INST: ucvtf z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x37,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c37ff <unknown>
+
+ucvtf z0.d, z0.s
+// CHECK-INST: ucvtf z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x34,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3400 <unknown>
+
+ucvtf z31.d, z31.s
+// CHECK-INST: ucvtf z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x37,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc37ff <unknown>
+
+// -----------------------------------------------------------------------
+// Unsigned integer convert to floating-point (top, unpredicated)
+
+ucvtflt z0.h, z0.b
+// CHECK-INST: ucvtflt z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x3c,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3c00 <unknown>
+
+ucvtflt z31.h, z31.b
+// CHECK-INST: ucvtflt z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x3f,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3fff <unknown>
+
+ucvtflt z0.s, z0.h
+// CHECK-INST: ucvtflt z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x3c,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3c00 <unknown>
+
+ucvtflt z31.s, z31.h
+// CHECK-INST: ucvtflt z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x3f,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3fff <unknown>
+
+ucvtflt z0.d, z0.s
+// CHECK-INST: ucvtflt z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x3c,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3c00 <unknown>
+
+ucvtflt z31.d, z31.s
+// CHECK-INST: ucvtflt z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x3f,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3fff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE2p3/directive-arch-negative.s
new file mode 100644
index 0000000..0a12cf8
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/directive-arch-negative.s
@@ -0,0 +1,9 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
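+// Enable sve2p3 via .arch, then strip it again with the no-form; the
+// instruction that follows must be rejected once the feature is disabled.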
+.arch armv9-a+sve2p3
+.arch armv9-a+nosve2p3
+addqp z0.b, z0.b, z0.b
+// CHECK: error: instruction requires: sme2p3 or sve2p3
+// CHECK-NEXT: addqp z0.b, z0.b, z0.b
diff --git a/llvm/test/MC/AArch64/SVE2p3/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE2p3/directive-arch_extension-negative.s
new file mode 100644
index 0000000..1af6245
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/directive-arch_extension-negative.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+.arch_extension sve2p3
+.arch_extension nosve2p3
+addqp z0.b, z0.b, z0.b
+// CHECK: error: instruction requires: sme2p3 or sve2p3
+// CHECK-NEXT: addqp z0.b, z0.b, z0.b
diff --git a/llvm/test/MC/AArch64/SVE2p3/directive-cpu-negative.s b/llvm/test/MC/AArch64/SVE2p3/directive-cpu-negative.s
new file mode 100644
index 0000000..f3dac04
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/directive-cpu-negative.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+.cpu generic+sve2p3
+.cpu generic+nosve2p3
+addqp z0.b, z0.b, z0.b
+// CHECK: error: instruction requires: sme2p3 or sve2p3
+// CHECK-NEXT: addqp z0.b, z0.b, z0.b
diff --git a/llvm/test/MC/AArch64/SVE2p3/dot-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/dot-diagnostics.s
new file mode 100644
index 0000000..3eb3792
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/dot-diagnostics.s
@@ -0,0 +1,137 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sdot z0.b, z0.b, z0.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.b, z0.b, z0.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.h, z0.h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.h, z0.h, z0.h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.s, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.s, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.d, z0.d[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.d, z0.d[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.b, z0.b, z0.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.b, z0.b, z0.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.h, z0.h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.h, z0.h, z0.h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.s, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.s, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.d, z0.d[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.d, z0.d[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid register range and index
+
+sdot z0.h, z0.b, z8.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: sdot z0.h, z0.b, z8.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.b, z0.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: sdot z0.h, z0.b, z0.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.b, z0.b[8]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: sdot z0.h, z0.b, z0.b[8]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.b, z8.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: udot z0.h, z0.b, z8.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.b, z0.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: udot z0.h, z0.b, z0.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.b, z0.b[8]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: udot z0.h, z0.b, z0.b[8]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/dot.s b/llvm/test/MC/AArch64/SVE2p3/dot.s
new file mode 100644
index 0000000..01021b38
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/dot.s
@@ -0,0 +1,173 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+sdot z0.h, z0.b, z0.b
+// CHECK-INST: sdot z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x00,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44400000 <unknown>
+
+sdot z10.h, z10.b, z10.b
+// CHECK-INST: sdot z10.h, z10.b, z10.b
+// CHECK-ENCODING: encoding: [0x4a,0x01,0x4a,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 444a014a <unknown>
+
+sdot z21.h, z21.b, z21.b
+// CHECK-INST: sdot z21.h, z21.b, z21.b
+// CHECK-ENCODING: encoding: [0xb5,0x02,0x55,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445502b5 <unknown>
+
+sdot z31.h, z31.b, z31.b
+// CHECK-INST: sdot z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x03,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445f03ff <unknown>
+
+movprfx z0, z7
+sdot z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: sdot z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0x00,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44420020 <unknown>
+
+// sdot indexed
+
+sdot z0.h, z0.b, z0.b[0]
+// CHECK-INST: sdot z0.h, z0.b, z0.b[0]
+// CHECK-ENCODING: encoding: [0x00,0x00,0x20,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44200000 <unknown>
+
+sdot z31.h, z31.b, z7.b[0]
+// CHECK-INST: sdot z31.h, z31.b, z7.b[0]
+// CHECK-ENCODING: encoding: [0xff,0x03,0x27,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442703ff <unknown>
+
+sdot z0.h, z0.b, z0.b[1]
+// CHECK-INST: sdot z0.h, z0.b, z0.b[1]
+// CHECK-ENCODING: encoding: [0x00,0x00,0x28,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44280000 <unknown>
+
+sdot z31.h, z31.b, z7.b[1]
+// CHECK-INST: sdot z31.h, z31.b, z7.b[1]
+// CHECK-ENCODING: encoding: [0xff,0x03,0x2f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442f03ff <unknown>
+
+sdot z0.h, z0.b, z0.b[7]
+// CHECK-INST: sdot z0.h, z0.b, z0.b[7]
+// CHECK-ENCODING: encoding: [0x00,0x00,0x78,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44780000 <unknown>
+
+sdot z31.h, z31.b, z7.b[7]
+// CHECK-INST: sdot z31.h, z31.b, z7.b[7]
+// CHECK-ENCODING: encoding: [0xff,0x03,0x7f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 447f03ff <unknown>
+
+movprfx z0, z7
+sdot z0.h, z1.b, z2.b[0]
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: sdot z0.h, z1.b, z2.b[0]
+// CHECK-ENCODING: encoding: [0x20,0x00,0x22,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44220020 <unknown>
+
+// udot
+
+udot z0.h, z0.b, z0.b
+// CHECK-INST: udot z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x04,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44400400 <unknown>
+
+udot z10.h, z10.b, z10.b
+// CHECK-INST: udot z10.h, z10.b, z10.b
+// CHECK-ENCODING: encoding: [0x4a,0x05,0x4a,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 444a054a <unknown>
+
+udot z21.h, z21.b, z21.b
+// CHECK-INST: udot z21.h, z21.b, z21.b
+// CHECK-ENCODING: encoding: [0xb5,0x06,0x55,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445506b5 <unknown>
+
+udot z31.h, z31.b, z31.b
+// CHECK-INST: udot z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x07,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445f07ff <unknown>
+
+movprfx z0, z7
+udot z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: udot z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0x04,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44420420 <unknown>
+
+// udot indexed
+
+udot z0.h, z0.b, z0.b[0]
+// CHECK-INST: udot z0.h, z0.b, z0.b[0]
+// CHECK-ENCODING: encoding: [0x00,0x04,0x20,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44200400 <unknown>
+
+udot z31.h, z31.b, z7.b[0]
+// CHECK-INST: udot z31.h, z31.b, z7.b[0]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x27,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442707ff <unknown>
+
+udot z0.h, z0.b, z0.b[1]
+// CHECK-INST: udot z0.h, z0.b, z0.b[1]
+// CHECK-ENCODING: encoding: [0x00,0x04,0x28,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44280400 <unknown>
+
+udot z31.h, z31.b, z7.b[1]
+// CHECK-INST: udot z31.h, z31.b, z7.b[1]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x2f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442f07ff <unknown>
+
+udot z0.h, z0.b, z0.b[7]
+// CHECK-INST: udot z0.h, z0.b, z0.b[7]
+// CHECK-ENCODING: encoding: [0x00,0x04,0x78,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44780400 <unknown>
+
+udot z31.h, z31.b, z7.b[7]
+// CHECK-INST: udot z31.h, z31.b, z7.b[7]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x7f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 447f07ff <unknown>
+
+movprfx z0, z7
+udot z0.h, z1.b, z2.b[0]
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: udot z0.h, z1.b, z2.b[0]
+// CHECK-ENCODING: encoding: [0x20,0x04,0x22,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44220420 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/luti6-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/luti6-diagnostics.s
new file mode 100644
index 0000000..21b4df9
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/luti6-diagnostics.s
@@ -0,0 +1,70 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width or operand
+
+luti6 z10.h, { z0.b, z1.b }, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 z10.h, { z0.b, z1.b }, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z10.s, { z0.b, z1.b }, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z10.s, { z0.b, z1.b }, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width or operand
+
+luti6 z10.s, { z0.h, z1.h }, z0[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z10.s, { z0.h, z1.h }, z0[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z10.b, { z0.h, z1.h }, z0[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 z10.b, { z0.h, z1.h }, z0[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid immediate range
+
+luti6 z10.h, { z0.h, z1.h }, z0[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 z10.h, { z0.h, z1.h }, z0[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z10.h, { z0.h, z1.h }, z0[2]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 z10.h, { z0.h, z1.h }, z0[2]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/luti6.s b/llvm/test/MC/AArch64/SVE2p3/luti6.s
new file mode 100644
index 0000000..848091c
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/luti6.s
@@ -0,0 +1,115 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// ---------------------------------------------------------------
+// Lookup table read with 6-bit indices (8-bit)
+
+luti6 z0.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z0.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x00,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac00 <unknown>
+
+luti6 z10.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z10.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x0a,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac0a <unknown>
+
+luti6 z21.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z21.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x15,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac15 <unknown>
+
+luti6 z31.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z31.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x1f,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac1f <unknown>
+
+luti6 z0.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z0.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xe0,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453fafe0 <unknown>
+
+luti6 z10.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z10.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xea,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453fafea <unknown>
+
+luti6 z21.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z21.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xf5,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453faff5 <unknown>
+
+luti6 z31.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z31.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xff,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453fafff <unknown>
+
+// ---------------------------------------------------------------
+// Lookup table read with 6-bit indices (16-bit)
+
+luti6 z0.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z0.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x00,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac00 <unknown>
+
+luti6 z10.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z10.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x0a,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac0a <unknown>
+
+luti6 z21.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z21.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x15,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac15 <unknown>
+
+luti6 z31.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z31.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x1f,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac1f <unknown>
+
+luti6 z0.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z0.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xe0,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffafe0 <unknown>
+
+luti6 z10.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z10.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xea,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffafea <unknown>
+
+luti6 z21.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z21.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xf5,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffaff5 <unknown>
+
+luti6 z31.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z31.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xff,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffafff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/qshrn-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/qshrn-diagnostics.s
new file mode 100644
index 0000000..a4dbae2
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/qshrn-diagnostics.s
@@ -0,0 +1,346 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
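+// The legal shift range follows the source element width: [1, 8] when
+// narrowing from .h to .b and [1, 16] when narrowing from .s to .h.
+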
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+sqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: sqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+sqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqrshrn z0.b, { z2.h, z3.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqrshrn z0.b, { z2.h, z3.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqrshrun z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrun z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrun z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrun z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+sqrshrun z10.h, { z0.b, z1.b }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: sqrshrun z10.h, { z0.b, z1.b }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+sqrshrun z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrun z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrun z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrun z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrun z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqrshrun z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqrshrun z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqrshrun z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+sqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqshrun z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrun z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrun z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqshrun z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqshrun z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+sqshrun z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrun z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrun z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.h, { z0.s, z1.s }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrun z10.h, { z0.s, z1.s }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.h, { z0.s, z1.s }, #17
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrun z10.h, { z0.s, z1.s }, #17
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
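+// --------------------------------------------------------------------------//
+// Invalid vector list
+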
+sqshrun z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqshrun z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+uqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+uqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: uqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+uqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: uqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+uqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
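+// --------------------------------------------------------------------------//
+// Invalid vector list
+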
+uqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: uqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+uqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+uqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: uqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+uqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: uqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: uqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
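+// --------------------------------------------------------------------------//
+// Invalid vector list
+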
+uqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: uqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/qshrn.s b/llvm/test/MC/AArch64/SVE2p3/qshrn.s
new file mode 100644
index 0000000..31c87cf
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/qshrn.s
@@ -0,0 +1,262 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// -----------------------------------------------------------------
+// Signed saturating rounding shift right narrow by immediate and interleave
+
+sqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x28,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af2800 <unknown>
+
+sqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x2b,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af2bdf <unknown>
+
+sqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x28,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a82800 <unknown>
+
+sqrshrn z31.b, { z30.h, z31.h }, #3
+// CHECK-INST: sqrshrn z31.b, { z30.h, z31.h }, #3
+// CHECK-ENCODING: encoding: [0xdf,0x2b,0xad,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ad2bdf <unknown>
+
+// -----------------------------------------------------------------
+// Signed saturating rounding shift right unsigned narrow by immediate and interleave
+
+sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x08,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af0800 <unknown>
+
+sqrshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqrshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x0b,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af0bdf <unknown>
+
+sqrshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqrshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x08,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a80800 <unknown>
+
+sqrshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: sqrshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x0b,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a80bdf <unknown>
+
+// -----------------------------------------------------------------
+// Signed saturating shift right narrow by immediate and interleave
+
+sqshrn z21.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqshrn z21.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xd5,0x03,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af03d5 <unknown>
+
+sqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af03df <unknown>
+
+sqshrn z10.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqshrn z10.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x0a,0x00,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a8000a <unknown>
+
+sqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: sqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a803df <unknown>
+
+sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x00,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af0000 <unknown>
+
+sqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-INST: sqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-ENCODING: encoding: [0x00,0x00,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf0000 <unknown>
+
+sqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-INST: sqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf03df <unknown>
+
+sqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-INST: sqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-ENCODING: encoding: [0x00,0x00,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b00000 <unknown>
+
+sqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-INST: sqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b003df <unknown>
+
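+// Note: the following is an observation from the encodings above, not a
+// quote from the architecture text. The shift amount is stored inverted:
+// the third encoding byte holds (2 * esize - shift) in its low bits, where
+// esize is the destination element size. For example, sqshrn .b #1 encodes
+// 0xaf (16 - 1 = 15) and #8 encodes 0xa8 (16 - 8 = 8); .h #1 encodes 0xbf
+// (32 - 1 = 31) and #16 encodes 0xb0 (32 - 16 = 16).
+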
+// -----------------------------------------------------------------
+// Signed saturating shift right unsigned narrow by immediate and interleave
+
+sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x20,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af2000 <unknown>
+
+sqshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af23df <unknown>
+
+sqshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x20,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a82000 <unknown>
+
+sqshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: sqshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a823df <unknown>
+
+sqshrun z0.h, { z0.s, z1.s }, #1
+// CHECK-INST: sqshrun z0.h, { z0.s, z1.s }, #1
+// CHECK-ENCODING: encoding: [0x00,0x20,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf2000 <unknown>
+
+sqshrun z31.h, { z30.s, z31.s }, #1
+// CHECK-INST: sqshrun z31.h, { z30.s, z31.s }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf23df <unknown>
+
+sqshrun z0.h, { z0.s, z1.s }, #16
+// CHECK-INST: sqshrun z0.h, { z0.s, z1.s }, #16
+// CHECK-ENCODING: encoding: [0x00,0x20,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b02000 <unknown>
+
+sqshrun z31.h, { z30.s, z31.s }, #16
+// CHECK-INST: sqshrun z31.h, { z30.s, z31.s }, #16
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b023df <unknown>
+
+// -----------------------------------------------------------------
+// Unsigned saturating rounding shift right narrow by immediate and interleave
+
+uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x38,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af3800 <unknown>
+
+uqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: uqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x3b,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af3bdf <unknown>
+
+uqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: uqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x38,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a83800 <unknown>
+
+uqrshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: uqrshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x3b,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a83bdf <unknown>
+
+// -----------------------------------------------------------------
+// Unsigned saturating shift right narrow by immediate and interleave
+
+uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x10,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af1000 <unknown>
+
+uqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: uqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af13df <unknown>
+
+uqshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: uqshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x10,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a81000 <unknown>
+
+uqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: uqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a813df <unknown>
+
+uqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-INST: uqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-ENCODING: encoding: [0x00,0x10,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf1000 <unknown>
+
+uqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-INST: uqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf13df <unknown>
+
+uqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-INST: uqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-ENCODING: encoding: [0x00,0x10,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b01000 <unknown>
+
+uqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-INST: uqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b013df <unknown>
diff --git a/llvm/test/MC/AArch64/armv8.4a-mpam.s b/llvm/test/MC/AArch64/armv8.4a-mpam.s
index 14787e6..7469227 100644
--- a/llvm/test/MC/AArch64/armv8.4a-mpam.s
+++ b/llvm/test/MC/AArch64/armv8.4a-mpam.s
@@ -1,6 +1,4 @@
// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a < %s 2> %t | FileCheck %s --check-prefix=CHECK
-// RUN: FileCheck --check-prefix=CHECK-RO < %t %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.4a < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
//------------------------------------------------------------------------------
// ARMV8.4-A MPAM Extensions
@@ -56,9 +54,6 @@ mrs x0, MPAMIDR_EL1
//CHECK: msr MPAMVPM6_EL2, x0 // encoding: [0xc0,0xa6,0x1c,0xd5]
//CHECK: msr MPAMVPM7_EL2, x0 // encoding: [0xe0,0xa6,0x1c,0xd5]
-//CHECK-RO: error: expected writable system register or pstate
-//CHECK-RO: msr MPAMIDR_EL1, x0
-//CHECK-RO: ^
//CHECK: mrs x0, MPAM0_EL1 // encoding: [0x20,0xa5,0x38,0xd5]
//CHECK: mrs x0, MPAM1_EL1 // encoding: [0x00,0xa5,0x38,0xd5]
@@ -77,100 +72,4 @@ mrs x0, MPAMIDR_EL1
//CHECK: mrs x0, MPAMVPM7_EL2 // encoding: [0xe0,0xa6,0x3c,0xd5]
//CHECK: mrs x0, MPAMIDR_EL1 // encoding: [0x80,0xa4,0x38,0xd5]
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM0_EL1, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM1_EL1, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM2_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM3_EL3, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM1_EL12, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMHCR_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPMV_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM0_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM1_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM2_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM3_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM4_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM5_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM6_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM7_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMIDR_EL1, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM0_EL1
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM1_EL1
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM2_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM3_EL3
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM1_EL12
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMHCR_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPMV_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM0_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM1_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM2_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM3_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM4_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM5_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM6_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM7_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMIDR_EL1
-//CHECK-ERROR: ^
diff --git a/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s
new file mode 100644
index 0000000..cffee7d
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s
@@ -0,0 +1,18 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+gcie -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+
+//------------------------------------------------------------------------------
+// FEAT_GCIE instructions
+//------------------------------------------------------------------------------
+
+gsb
+// CHECK-ERROR: error: invalid operand for GSB instruction
+
+gicr
+// CHECK-ERROR: error: expected register operand
+
+gicr x3, foo
+// CHECK-ERROR: error: invalid operand for GICR instruction
+
+gic cdaff
+// CHECK-ERROR: error: specified gic op requires a register
diff --git a/llvm/test/MC/AArch64/armv9.7a-gcie.s b/llvm/test/MC/AArch64/armv9.7a-gcie.s
new file mode 100644
index 0000000..4fd5d25
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-gcie.s
@@ -0,0 +1,987 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+gcie < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+gcie < %s \
+// RUN: | llvm-objdump -d --mattr=+gcie --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+gcie < %s \
+// RUN: | llvm-objdump -d --mattr=-gcie --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+gcie < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+gcie -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_GCIE Extensions
+//------------------------------------------------------------------------------
+
+// CPU Interface Registers MRS Instructions - Encodings Checked
+MRS x3, ICC_APR_EL1
+// CHECK-INST: mrs x3, ICC_APR_EL1
+// CHECK-ENCODING: [0x03,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c003
+
+MRS x3, ICC_APR_EL3
+// CHECK-INST: mrs x3, ICC_APR_EL3
+// CHECK-ENCODING: [0x03,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec803
+
+MRS x3, ICC_CR0_EL1
+// CHECK-INST: mrs x3, ICC_CR0_EL1
+// CHECK-ENCODING: [0x23,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c023
+
+MRS x3, ICC_CR0_EL3
+// CHECK-INST: mrs x3, ICC_CR0_EL3
+// CHECK-ENCODING: [0x03,0xc9,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec903
+
+MRS x3, ICC_DOMHPPIR_EL3
+// CHECK-INST: mrs x3, ICC_DOMHPPIR_EL3
+// CHECK-ENCODING: [0x43,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec843
+
+MRS x3, ICC_HAPR_EL1
+// CHECK-INST: mrs x3, ICC_HAPR_EL1
+// CHECK-ENCODING: [0x63,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c063
+
+MRS x3, ICC_HPPIR_EL1
+// CHECK-INST: mrs x3, ICC_HPPIR_EL1
+// CHECK-ENCODING: [0x63,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca63
+
+MRS x3, ICC_HPPIR_EL3
+// CHECK-INST: mrs x3, ICC_HPPIR_EL3
+// CHECK-ENCODING: [0x23,0xc9,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec923
+
+MRS x3, ICC_IAFFIDR_EL1
+// CHECK-INST: mrs x3, ICC_IAFFIDR_EL1
+// CHECK-ENCODING: [0xa3,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538caa3
+
+MRS x3, ICC_ICSR_EL1
+// CHECK-INST: mrs x3, ICC_ICSR_EL1
+// CHECK-ENCODING: [0x83,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca83
+
+MRS x3, ICC_IDR0_EL1
+// CHECK-INST: mrs x3, ICC_IDR0_EL1
+// CHECK-ENCODING: [0x43,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca43
+
+MRS x3, ICC_PCR_EL1
+// CHECK-INST: mrs x3, ICC_PCR_EL1
+// CHECK-ENCODING: [0x43,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c043
+
+MRS x3, ICC_PCR_EL3
+// CHECK-INST: mrs x3, ICC_PCR_EL3
+// CHECK-ENCODING: [0x23,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec823
+
+MRS x3, ICC_SRE_EL1
+// CHECK-INST: mrs x3, ICC_SRE_EL1
+// CHECK-ENCODING: [0xa3,0xcc,0x38,0xd5]
+// CHECK-UNKNOWN: d538cca3
+
+// -----------------------------------------------
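+// CPU Interface Registers MSR Instructions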
+MSR ICC_APR_EL1, x3
+// CHECK-INST: msr ICC_APR_EL1, x3
+// CHECK-ENCODING: [0x03,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c003
+
+MSR ICC_APR_EL3, x3
+// CHECK-INST: msr ICC_APR_EL3, x3
+// CHECK-ENCODING: [0x03,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec803
+
+MSR ICC_CR0_EL1, x3
+// CHECK-INST: msr ICC_CR0_EL1, x3
+// CHECK-ENCODING: [0x23,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c023
+
+MSR ICC_CR0_EL3, x3
+// CHECK-INST: msr ICC_CR0_EL3, x3
+// CHECK-ENCODING: [0x03,0xc9,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec903
+
+MSR ICC_ICSR_EL1, x3
+// CHECK-INST: msr ICC_ICSR_EL1, x3
+// CHECK-ENCODING: [0x83,0xca,0x18,0xd5]
+// CHECK-UNKNOWN: d518ca83
+
+MSR ICC_PCR_EL1, x3
+// CHECK-INST: msr ICC_PCR_EL1, x3
+// CHECK-ENCODING: [0x43,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c043
+
+MSR ICC_PCR_EL3, x3
+// CHECK-INST: msr ICC_PCR_EL3, x3
+// CHECK-ENCODING: [0x23,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec823
+
+// -----------------------------------------------
+// Virtual CPU Registers MRS Instructions
+
+// The specification says:
+// "Each ICC_system register that is accessible at EL1 and higher and whose state
+// is specific to the Virtual Interrupt Domain, has a corresponding virtual
+// ICV_register. The ICV_registers are accessed using the same system register
+// encodings as their ICC_counterparts."
+//
+// So expect ICC_* encodings here, not ICV_* encodings
+
+MRS x3, ICV_APR_EL1
+// CHECK-INST: mrs x3, ICC_APR_EL1
+// CHECK-ENCODING: [0x03,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c003
+
+MRS x3, ICV_CR0_EL1
+// CHECK-INST: mrs x3, ICC_CR0_EL1
+// CHECK-ENCODING: [0x23,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c023
+
+MRS x3, ICV_HAPR_EL1
+// CHECK-INST: mrs x3, ICC_HAPR_EL1
+// CHECK-ENCODING: [0x63,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c063
+
+MRS x3, ICV_HPPIR_EL1
+// CHECK-INST: mrs x3, ICC_HPPIR_EL1
+// CHECK-ENCODING: [0x63,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca63
+
+MRS x3, ICV_PCR_EL1
+// CHECK-INST: mrs x3, ICC_PCR_EL1
+// CHECK-ENCODING: [0x43,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c043
+
+
+// -----------------------------------------------
+// Likewise for MSR, expect ICC_* encodings here, not ICV_* encodings
+MSR ICV_APR_EL1, x3
+// CHECK-INST: msr ICC_APR_EL1, x3
+// CHECK-ENCODING: [0x03,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c003
+
+MSR ICV_CR0_EL1, x3
+// CHECK-INST: msr ICC_CR0_EL1, x3
+// CHECK-ENCODING: [0x23,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c023
+
+MSR ICV_PCR_EL1, x3
+// CHECK-INST: msr ICC_PCR_EL1, x3
+// CHECK-ENCODING: [0x43,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c043
+
+// -----------------------------------------------
+// PPI Registers MRS Instructions - Encodings Checked
+MRS x3, ICC_PPI_CACTIVER0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_CACTIVER0_EL1
+// CHECK-ENCODING: [0x03,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd03
+
+MRS x3, ICC_PPI_CACTIVER1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_CACTIVER1_EL1
+// CHECK-ENCODING: [0x23,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd23
+
+MRS x3, ICC_PPI_CPENDR0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_CPENDR0_EL1
+// CHECK-ENCODING: [0x83,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd83
+
+MRS x3, ICC_PPI_CPENDR1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_CPENDR1_EL1
+// CHECK-ENCODING: [0xa3,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cda3
+
+MRS x3, ICC_PPI_DOMAINR0_EL3
+// CHECK-INST: mrs x3, ICC_PPI_DOMAINR0_EL3
+// CHECK-ENCODING: [0x83,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec883
+
+MRS x3, ICC_PPI_DOMAINR1_EL3
+// CHECK-INST: mrs x3, ICC_PPI_DOMAINR1_EL3
+// CHECK-ENCODING: [0xa3,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec8a3
+
+MRS x3, ICC_PPI_DOMAINR2_EL3
+// CHECK-INST: mrs x3, ICC_PPI_DOMAINR2_EL3
+// CHECK-ENCODING: [0xc3,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec8c3
+
+MRS x3, ICC_PPI_DOMAINR3_EL3
+// CHECK-INST: mrs x3, ICC_PPI_DOMAINR3_EL3
+// CHECK-ENCODING: [0xe3,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec8e3
+
+MRS x3, ICC_PPI_ENABLER0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_ENABLER0_EL1
+// CHECK-ENCODING: [0xc3,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538cac3
+
+MRS x3, ICC_PPI_ENABLER1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_ENABLER1_EL1
+// CHECK-ENCODING: [0xe3,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538cae3
+
+MRS x3, ICC_PPI_PRIORITYR0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR0_EL1
+// CHECK-ENCODING: [0x03,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce03
+
+MRS x3, ICC_PPI_PRIORITYR1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR1_EL1
+// CHECK-ENCODING: [0x23,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce23
+
+MRS x3, ICC_PPI_PRIORITYR2_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR2_EL1
+// CHECK-ENCODING: [0x43,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce43
+
+MRS x3, ICC_PPI_PRIORITYR3_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR3_EL1
+// CHECK-ENCODING: [0x63,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce63
+
+MRS x3, ICC_PPI_PRIORITYR4_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR4_EL1
+// CHECK-ENCODING: [0x83,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce83
+
+MRS x3, ICC_PPI_PRIORITYR5_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR5_EL1
+// CHECK-ENCODING: [0xa3,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538cea3
+
+MRS x3, ICC_PPI_PRIORITYR6_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR6_EL1
+// CHECK-ENCODING: [0xc3,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538cec3
+
+MRS x3, ICC_PPI_PRIORITYR7_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR7_EL1
+// CHECK-ENCODING: [0xe3,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538cee3
+
+MRS x3, ICC_PPI_PRIORITYR8_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR8_EL1
+// CHECK-ENCODING: [0x03,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf03
+
+MRS x3, ICC_PPI_PRIORITYR9_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR9_EL1
+// CHECK-ENCODING: [0x23,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf23
+
+MRS x3, ICC_PPI_PRIORITYR10_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR10_EL1
+// CHECK-ENCODING: [0x43,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf43
+
+MRS x3, ICC_PPI_PRIORITYR11_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR11_EL1
+// CHECK-ENCODING: [0x63,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf63
+
+MRS x3, ICC_PPI_PRIORITYR12_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR12_EL1
+// CHECK-ENCODING: [0x83,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf83
+
+MRS x3, ICC_PPI_PRIORITYR13_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR13_EL1
+// CHECK-ENCODING: [0xa3,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cfa3
+
+MRS x3, ICC_PPI_PRIORITYR14_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR14_EL1
+// CHECK-ENCODING: [0xc3,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cfc3
+
+MRS x3, ICC_PPI_PRIORITYR15_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR15_EL1
+// CHECK-ENCODING: [0xe3,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cfe3
+
+MRS x3, ICC_PPI_SACTIVER0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_SACTIVER0_EL1
+// CHECK-ENCODING: [0x43,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd43
+
+MRS x3, ICC_PPI_SACTIVER1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_SACTIVER1_EL1
+// CHECK-ENCODING: [0x63,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd63
+
+MRS x3, ICC_PPI_SPENDR0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_SPENDR0_EL1
+// CHECK-ENCODING: [0xc3,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cdc3
+
+MRS x3, ICC_PPI_SPENDR1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_SPENDR1_EL1
+// CHECK-ENCODING: [0xe3,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cde3
+
+MRS x3, ICC_PPI_HMR0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_HMR0_EL1
+// CHECK-ENCODING: [0x03,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca03
+
+MRS x3, ICC_PPI_HMR1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_HMR1_EL1
+// CHECK-ENCODING: [0x23,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca23
+
+// -----------------------------------------------
+// PPI Registers MSR Instructions
+MSR ICC_PPI_CACTIVER0_EL1, x3
+// CHECK-INST: msr ICC_PPI_CACTIVER0_EL1, x3
+// CHECK-ENCODING: [0x03,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd03
+
+MSR ICC_PPI_CACTIVER1_EL1, x3
+// CHECK-INST: msr ICC_PPI_CACTIVER1_EL1, x3
+// CHECK-ENCODING: [0x23,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd23
+
+MSR ICC_PPI_CPENDR0_EL1, x3
+// CHECK-INST: msr ICC_PPI_CPENDR0_EL1, x3
+// CHECK-ENCODING: [0x83,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd83
+
+MSR ICC_PPI_CPENDR1_EL1, x3
+// CHECK-INST: msr ICC_PPI_CPENDR1_EL1, x3
+// CHECK-ENCODING: [0xa3,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cda3
+
+MSR ICC_PPI_DOMAINR0_EL3, x3
+// CHECK-INST: msr ICC_PPI_DOMAINR0_EL3, x3
+// CHECK-ENCODING: [0x83,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec883
+
+MSR ICC_PPI_DOMAINR1_EL3, x3
+// CHECK-INST: msr ICC_PPI_DOMAINR1_EL3, x3
+// CHECK-ENCODING: [0xa3,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec8a3
+
+MSR ICC_PPI_DOMAINR2_EL3, x3
+// CHECK-INST: msr ICC_PPI_DOMAINR2_EL3, x3
+// CHECK-ENCODING: [0xc3,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec8c3
+
+MSR ICC_PPI_DOMAINR3_EL3, x3
+// CHECK-INST: msr ICC_PPI_DOMAINR3_EL3, x3
+// CHECK-ENCODING: [0xe3,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec8e3
+
+MSR ICC_PPI_ENABLER0_EL1, x3
+// CHECK-INST: msr ICC_PPI_ENABLER0_EL1, x3
+// CHECK-ENCODING: [0xc3,0xca,0x18,0xd5]
+// CHECK-UNKNOWN: d518cac3
+
+MSR ICC_PPI_ENABLER1_EL1, x3
+// CHECK-INST: msr ICC_PPI_ENABLER1_EL1, x3
+// CHECK-ENCODING: [0xe3,0xca,0x18,0xd5]
+// CHECK-UNKNOWN: d518cae3
+
+MSR ICC_PPI_PRIORITYR0_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR0_EL1, x3
+// CHECK-ENCODING: [0x03,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce03
+
+MSR ICC_PPI_PRIORITYR1_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR1_EL1, x3
+// CHECK-ENCODING: [0x23,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce23
+
+MSR ICC_PPI_PRIORITYR2_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR2_EL1, x3
+// CHECK-ENCODING: [0x43,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce43
+
+MSR ICC_PPI_PRIORITYR3_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR3_EL1, x3
+// CHECK-ENCODING: [0x63,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce63
+
+MSR ICC_PPI_PRIORITYR4_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR4_EL1, x3
+// CHECK-ENCODING: [0x83,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce83
+
+MSR ICC_PPI_PRIORITYR5_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR5_EL1, x3
+// CHECK-ENCODING: [0xa3,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518cea3
+
+MSR ICC_PPI_PRIORITYR6_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR6_EL1, x3
+// CHECK-ENCODING: [0xc3,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518cec3
+
+MSR ICC_PPI_PRIORITYR7_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR7_EL1, x3
+// CHECK-ENCODING: [0xe3,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518cee3
+
+MSR ICC_PPI_PRIORITYR8_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR8_EL1, x3
+// CHECK-ENCODING: [0x03,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf03
+
+MSR ICC_PPI_PRIORITYR9_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR9_EL1, x3
+// CHECK-ENCODING: [0x23,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf23
+
+MSR ICC_PPI_PRIORITYR10_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR10_EL1, x3
+// CHECK-ENCODING: [0x43,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf43
+
+MSR ICC_PPI_PRIORITYR11_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR11_EL1, x3
+// CHECK-ENCODING: [0x63,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf63
+
+MSR ICC_PPI_PRIORITYR12_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR12_EL1, x3
+// CHECK-ENCODING: [0x83,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf83
+
+MSR ICC_PPI_PRIORITYR13_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR13_EL1, x3
+// CHECK-ENCODING: [0xa3,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cfa3
+
+MSR ICC_PPI_PRIORITYR14_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR14_EL1, x3
+// CHECK-ENCODING: [0xc3,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cfc3
+
+MSR ICC_PPI_PRIORITYR15_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR15_EL1, x3
+// CHECK-ENCODING: [0xe3,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cfe3
+
+MSR ICC_PPI_SACTIVER0_EL1, x3
+// CHECK-INST: msr ICC_PPI_SACTIVER0_EL1, x3
+// CHECK-ENCODING: [0x43,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd43
+
+MSR ICC_PPI_SACTIVER1_EL1, x3
+// CHECK-INST: msr ICC_PPI_SACTIVER1_EL1, x3
+// CHECK-ENCODING: [0x63,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd63
+
+MSR ICC_PPI_SPENDR0_EL1, x3
+// CHECK-INST: msr ICC_PPI_SPENDR0_EL1, x3
+// CHECK-ENCODING: [0xc3,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cdc3
+
+MSR ICC_PPI_SPENDR1_EL1, x3
+// CHECK-INST: msr ICC_PPI_SPENDR1_EL1, x3
+// CHECK-ENCODING: [0xe3,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cde3
+
+// -----------------------------------------------
+// Hypervisor Control Register MRS Instructions
+MRS x3, ICH_APR_EL2
+// CHECK-INST: mrs x3, ICH_APR_EL2
+// CHECK-ENCODING: [0x83,0xc8,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc883
+
+MRS x3, ICH_CONTEXTR_EL2
+// CHECK-INST: mrs x3, ICH_CONTEXTR_EL2
+// CHECK-ENCODING: [0xc3,0xcb,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccbc3
+
+MRS x3, ICH_HFGITR_EL2
+// CHECK-INST: mrs x3, ICH_HFGITR_EL2
+// CHECK-ENCODING: [0xe3,0xc9,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc9e3
+
+MRS x3, ICH_HFGRTR_EL2
+// CHECK-INST: mrs x3, ICH_HFGRTR_EL2
+// CHECK-ENCODING: [0x83,0xc9,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc983
+
+MRS x3, ICH_HFGWTR_EL2
+// CHECK-INST: mrs x3, ICH_HFGWTR_EL2
+// CHECK-ENCODING: [0xc3,0xc9,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc9c3
+
+MRS x3, ICH_HPPIR_EL2
+// CHECK-INST: mrs x3, ICH_HPPIR_EL2
+// CHECK-ENCODING: [0xa3,0xc8,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc8a3
+
+MRS x3, ICH_PPI_ACTIVER0_EL2
+// CHECK-INST: mrs x3, ICH_PPI_ACTIVER0_EL2
+// CHECK-ENCODING: [0xc3,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccac3
+
+MRS x3, ICH_PPI_ACTIVER1_EL2
+// CHECK-INST: mrs x3, ICH_PPI_ACTIVER1_EL2
+// CHECK-ENCODING: [0xe3,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccae3
+
+MRS x3, ICH_PPI_DVIR0_EL2
+// CHECK-INST: mrs x3, ICH_PPI_DVIR0_EL2
+// CHECK-ENCODING: [0x03,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca03
+
+MRS x3, ICH_PPI_DVIR1_EL2
+// CHECK-INST: mrs x3, ICH_PPI_DVIR1_EL2
+// CHECK-ENCODING: [0x23,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca23
+
+MRS x3, ICH_PPI_ENABLER0_EL2
+// CHECK-INST: mrs x3, ICH_PPI_ENABLER0_EL2
+// CHECK-ENCODING: [0x43,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca43
+
+MRS x3, ICH_PPI_ENABLER1_EL2
+// CHECK-INST: mrs x3, ICH_PPI_ENABLER1_EL2
+// CHECK-ENCODING: [0x63,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca63
+
+MRS x3, ICH_PPI_PENDR0_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PENDR0_EL2
+// CHECK-ENCODING: [0x83,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca83
+
+MRS x3, ICH_PPI_PENDR1_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PENDR1_EL2
+// CHECK-ENCODING: [0xa3,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccaa3
+
+MRS x3, ICH_PPI_PRIORITYR0_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR0_EL2
+// CHECK-ENCODING: [0x03,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce03
+
+MRS x3, ICH_PPI_PRIORITYR1_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR1_EL2
+// CHECK-ENCODING: [0x23,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce23
+
+MRS x3, ICH_PPI_PRIORITYR2_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR2_EL2
+// CHECK-ENCODING: [0x43,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce43
+
+MRS x3, ICH_PPI_PRIORITYR3_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR3_EL2
+// CHECK-ENCODING: [0x63,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce63
+
+MRS x3, ICH_PPI_PRIORITYR4_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR4_EL2
+// CHECK-ENCODING: [0x83,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce83
+
+MRS x3, ICH_PPI_PRIORITYR5_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR5_EL2
+// CHECK-ENCODING: [0xa3,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccea3
+
+MRS x3, ICH_PPI_PRIORITYR6_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR6_EL2
+// CHECK-ENCODING: [0xc3,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccec3
+
+MRS x3, ICH_PPI_PRIORITYR7_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR7_EL2
+// CHECK-ENCODING: [0xe3,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccee3
+
+MRS x3, ICH_PPI_PRIORITYR8_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR8_EL2
+// CHECK-ENCODING: [0x03,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf03
+
+MRS x3, ICH_PPI_PRIORITYR9_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR9_EL2
+// CHECK-ENCODING: [0x23,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf23
+
+MRS x3, ICH_PPI_PRIORITYR10_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR10_EL2
+// CHECK-ENCODING: [0x43,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf43
+
+MRS x3, ICH_PPI_PRIORITYR11_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR11_EL2
+// CHECK-ENCODING: [0x63,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf63
+
+MRS x3, ICH_PPI_PRIORITYR12_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR12_EL2
+// CHECK-ENCODING: [0x83,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf83
+
+MRS x3, ICH_PPI_PRIORITYR13_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR13_EL2
+// CHECK-ENCODING: [0xa3,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccfa3
+
+MRS x3, ICH_PPI_PRIORITYR14_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR14_EL2
+// CHECK-ENCODING: [0xc3,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccfc3
+
+MRS x3, ICH_PPI_PRIORITYR15_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR15_EL2
+// CHECK-ENCODING: [0xe3,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccfe3
+
+MRS x3, ICH_VCTLR_EL2
+// CHECK-INST: mrs x3, ICH_VCTLR_EL2
+// CHECK-ENCODING: [0x83,0xcb,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccb83
+
+// -----------------------------------------------
+// Hypervisor Control Register MSR Instructions
+MSR ICH_APR_EL2, x3
+// CHECK-INST: msr ICH_APR_EL2, x3
+// CHECK-ENCODING: [0x83,0xc8,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc883
+
+MSR ICH_CONTEXTR_EL2, x3
+// CHECK-INST: msr ICH_CONTEXTR_EL2, x3
+// CHECK-ENCODING: [0xc3,0xcb,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccbc3
+
+MSR ICH_HFGITR_EL2, x3
+// CHECK-INST: msr ICH_HFGITR_EL2, x3
+// CHECK-ENCODING: [0xe3,0xc9,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc9e3
+
+MSR ICH_HFGRTR_EL2, x3
+// CHECK-INST: msr ICH_HFGRTR_EL2, x3
+// CHECK-ENCODING: [0x83,0xc9,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc983
+
+MSR ICH_HFGWTR_EL2, x3
+// CHECK-INST: msr ICH_HFGWTR_EL2, x3
+// CHECK-ENCODING: [0xc3,0xc9,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc9c3
+
+MSR ICH_PPI_ACTIVER0_EL2, x3
+// CHECK-INST: msr ICH_PPI_ACTIVER0_EL2, x3
+// CHECK-ENCODING: [0xc3,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccac3
+
+MSR ICH_PPI_ACTIVER1_EL2, x3
+// CHECK-INST: msr ICH_PPI_ACTIVER1_EL2, x3
+// CHECK-ENCODING: [0xe3,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccae3
+
+MSR ICH_PPI_DVIR0_EL2, x3
+// CHECK-INST: msr ICH_PPI_DVIR0_EL2, x3
+// CHECK-ENCODING: [0x03,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca03
+
+MSR ICH_PPI_DVIR1_EL2, x3
+// CHECK-INST: msr ICH_PPI_DVIR1_EL2, x3
+// CHECK-ENCODING: [0x23,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca23
+
+MSR ICH_PPI_ENABLER0_EL2, x3
+// CHECK-INST: msr ICH_PPI_ENABLER0_EL2, x3
+// CHECK-ENCODING: [0x43,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca43
+
+MSR ICH_PPI_ENABLER1_EL2, x3
+// CHECK-INST: msr ICH_PPI_ENABLER1_EL2, x3
+// CHECK-ENCODING: [0x63,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca63
+
+MSR ICH_PPI_PENDR0_EL2, x3
+// CHECK-INST: msr ICH_PPI_PENDR0_EL2, x3
+// CHECK-ENCODING: [0x83,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca83
+
+MSR ICH_PPI_PENDR1_EL2, x3
+// CHECK-INST: msr ICH_PPI_PENDR1_EL2, x3
+// CHECK-ENCODING: [0xa3,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccaa3
+
+MSR ICH_PPI_PRIORITYR0_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR0_EL2, x3
+// CHECK-ENCODING: [0x03,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce03
+
+MSR ICH_PPI_PRIORITYR1_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR1_EL2, x3
+// CHECK-ENCODING: [0x23,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce23
+
+MSR ICH_PPI_PRIORITYR2_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR2_EL2, x3
+// CHECK-ENCODING: [0x43,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce43
+
+MSR ICH_PPI_PRIORITYR3_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR3_EL2, x3
+// CHECK-ENCODING: [0x63,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce63
+
+MSR ICH_PPI_PRIORITYR4_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR4_EL2, x3
+// CHECK-ENCODING: [0x83,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce83
+
+MSR ICH_PPI_PRIORITYR5_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR5_EL2, x3
+// CHECK-ENCODING: [0xa3,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccea3
+
+MSR ICH_PPI_PRIORITYR6_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR6_EL2, x3
+// CHECK-ENCODING: [0xc3,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccec3
+
+MSR ICH_PPI_PRIORITYR7_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR7_EL2, x3
+// CHECK-ENCODING: [0xe3,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccee3
+
+MSR ICH_PPI_PRIORITYR8_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR8_EL2, x3
+// CHECK-ENCODING: [0x03,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf03
+
+MSR ICH_PPI_PRIORITYR9_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR9_EL2, x3
+// CHECK-ENCODING: [0x23,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf23
+
+MSR ICH_PPI_PRIORITYR10_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR10_EL2, x3
+// CHECK-ENCODING: [0x43,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf43
+
+MSR ICH_PPI_PRIORITYR11_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR11_EL2, x3
+// CHECK-ENCODING: [0x63,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf63
+
+MSR ICH_PPI_PRIORITYR12_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR12_EL2, x3
+// CHECK-ENCODING: [0x83,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf83
+
+MSR ICH_PPI_PRIORITYR13_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR13_EL2, x3
+// CHECK-ENCODING: [0xa3,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccfa3
+
+MSR ICH_PPI_PRIORITYR14_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR14_EL2, x3
+// CHECK-ENCODING: [0xc3,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccfc3
+
+MSR ICH_PPI_PRIORITYR15_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR15_EL2, x3
+// CHECK-ENCODING: [0xe3,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccfe3
+
+MSR ICH_VCTLR_EL2, x3
+// CHECK-INST: msr ICH_VCTLR_EL2, x3
+// CHECK-ENCODING: [0x83,0xcb,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccb83
+
+// -----------------------------------------------
+// FEAT_GCIE Instructions
+// Current Interrupt Domain
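+// Without the gcie feature the disassembler decodes these to the generic
+// SYS/SYSL forms shown on the CHECK-UNKNOWN lines below.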
+GIC CDAFF, x3
+// CHECK-INST: gic cdaff, x3
+// CHECK-ENCODING: [0x63,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c163 sys #0, c12, c1, #3, x3
+// CHECK-ERROR: error: GIC cdaff requires: gcie
+
+GIC CDDI, x3
+// CHECK-INST: gic cddi, x3
+// CHECK-ENCODING: [0x03,0xc2,0x08,0xd5]
+// CHECK-UNKNOWN: d508c203 sys #0, c12, c2, #0, x3
+// CHECK-ERROR: error: GIC cddi requires: gcie
+
+GIC CDDIS, x3
+// CHECK-INST: gic cddis, x3
+// CHECK-ENCODING: [0x03,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c103 sys #0, c12, c1, #0, x3
+// CHECK-ERROR: error: GIC cddis requires: gcie
+
+GIC CDEN, x3
+// CHECK-INST: gic cden, x3
+// CHECK-ENCODING: [0x23,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c123 sys #0, c12, c1, #1, x3
+// CHECK-ERROR: error: GIC cden requires: gcie
+
+GIC CDEOI, x3
+// CHECK-INST: gic cdeoi, x3
+// CHECK-ENCODING: [0xe3,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c1e3 sys #0, c12, c1, #7, x3
+// CHECK-ERROR: error: GIC cdeoi requires: gcie
+
+GIC CDHM, x3
+// CHECK-INST: gic cdhm, x3
+// CHECK-ENCODING: [0x23,0xc2,0x08,0xd5]
+// CHECK-UNKNOWN: d508c223 sys #0, c12, c2, #1, x3
+// CHECK-ERROR: error: GIC cdhm requires: gcie
+
+GIC CDPEND, x3
+// CHECK-INST: gic cdpend, x3
+// CHECK-ENCODING: [0x83,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c183 sys #0, c12, c1, #4, x3
+// CHECK-ERROR: error: GIC cdpend requires: gcie
+
+GIC CDPRI, x3
+// CHECK-INST: gic cdpri, x3
+// CHECK-ENCODING: [0x43,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c143 sys #0, c12, c1, #2, x3
+// CHECK-ERROR: error: GIC cdpri requires: gcie
+
+GIC CDRCFG, x3
+// CHECK-INST: gic cdrcfg, x3
+// CHECK-ENCODING: [0xa3,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c1a3 sys #0, c12, c1, #5, x3
+// CHECK-ERROR: error: GIC cdrcfg requires: gcie
+
+GICR x3, CDIA
+// CHECK-INST: gicr x3, cdia
+// CHECK-ENCODING: [0x03,0xc3,0x28,0xd5]
+// CHECK-UNKNOWN: d528c303 sysl x3, #0, c12, c3, #0
+// CHECK-ERROR: error: GICR cdia requires: gcie
+
+GICR x3, CDNMIA
+// CHECK-INST: gicr x3, cdnmia
+// CHECK-ENCODING: [0x23,0xc3,0x28,0xd5]
+// CHECK-UNKNOWN: d528c323 sysl x3, #0, c12, c3, #1
+// CHECK-ERROR: error: GICR cdnmia requires: gcie
+
+// -----------------------------------------------
+// Virtual Interrupt Domain
+GIC VDAFF, x3
+// CHECK-INST: gic vdaff, x3
+// CHECK-ENCODING: [0x63,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc163 sys #4, c12, c1, #3, x3
+// CHECK-ERROR: error: GIC vdaff requires: gcie
+
+GIC VDDI, x3
+// CHECK-INST: gic vddi, x3
+// CHECK-ENCODING: [0x03,0xc2,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc203 sys #4, c12, c2, #0, x3
+// CHECK-ERROR: error: GIC vddi requires: gcie
+
+GIC VDDIS, x3
+// CHECK-INST: gic vddis, x3
+// CHECK-ENCODING: [0x03,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc103 sys #4, c12, c1, #0, x3
+// CHECK-ERROR: error: GIC vddis requires: gcie
+
+GIC VDEN, x3
+// CHECK-INST: gic vden, x3
+// CHECK-ENCODING: [0x23,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc123 sys #4, c12, c1, #1, x3
+// CHECK-ERROR: error: GIC vden requires: gcie
+
+GIC VDHM, x3
+// CHECK-INST: gic vdhm, x3
+// CHECK-ENCODING: [0x23,0xc2,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc223 sys #4, c12, c2, #1, x3
+// CHECK-ERROR: error: GIC vdhm requires: gcie
+
+GIC VDPEND, x3
+// CHECK-INST: gic vdpend, x3
+// CHECK-ENCODING: [0x83,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc183 sys #4, c12, c1, #4, x3
+// CHECK-ERROR: error: GIC vdpend requires: gcie
+
+GIC VDPRI, x3
+// CHECK-INST: gic vdpri, x3
+// CHECK-ENCODING: [0x43,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc143 sys #4, c12, c1, #2, x3
+// CHECK-ERROR: error: GIC vdpri requires: gcie
+
+GIC VDRCFG, x3
+// CHECK-INST: gic vdrcfg, x3
+// CHECK-ENCODING: [0xa3,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc1a3 sys #4, c12, c1, #5, x3
+// CHECK-ERROR: error: GIC vdrcfg requires: gcie
+
+// -----------------------------------------------
+// Logical Interrupt Domain
+GIC LDAFF, x3
+// CHECK-INST: gic ldaff, x3
+// CHECK-ENCODING: [0x63,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec163 sys #6, c12, c1, #3, x3
+// CHECK-ERROR: error: GIC ldaff requires: gcie
+
+GIC LDDI, x3
+// CHECK-INST: gic lddi, x3
+// CHECK-ENCODING: [0x03,0xc2,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec203 sys #6, c12, c2, #0, x3
+// CHECK-ERROR: error: GIC lddi requires: gcie
+
+GIC LDDIS, x3
+// CHECK-INST: gic lddis, x3
+// CHECK-ENCODING: [0x03,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec103 sys #6, c12, c1, #0, x3
+// CHECK-ERROR: error: GIC lddis requires: gcie
+
+GIC LDEN, x3
+// CHECK-INST: gic lden, x3
+// CHECK-ENCODING: [0x23,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec123 sys #6, c12, c1, #1, x3
+// CHECK-ERROR: error: GIC lden requires: gcie
+
+GIC LDHM, x3
+// CHECK-INST: gic ldhm, x3
+// CHECK-ENCODING: [0x23,0xc2,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec223 sys #6, c12, c2, #1, x3
+// CHECK-ERROR: error: GIC ldhm requires: gcie
+
+GIC LDPEND, x3
+// CHECK-INST: gic ldpend, x3
+// CHECK-ENCODING: [0x83,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec183 sys #6, c12, c1, #4, x3
+// CHECK-ERROR: error: GIC ldpend requires: gcie
+
+GIC LDPRI, x3
+// CHECK-INST: gic ldpri, x3
+// CHECK-ENCODING: [0x43,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec143 sys #6, c12, c1, #2, x3
+// CHECK-ERROR: error: GIC ldpri requires: gcie
+
+GIC LDRCFG, x3
+// CHECK-INST: gic ldrcfg, x3
+// CHECK-ENCODING: [0xa3,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec1a3 sys #6, c12, c1, #5, x3
+// CHECK-ERROR: error: GIC ldrcfg requires: gcie
+
+// -----------------------------------------------
+// GIC Synchronization Barrier Instructions
+GSB SYS
+// CHECK-INST: gsb sys
+// CHECK-ENCODING: [0x1f,0xc0,0x08,0xd5]
+// CHECK-UNKNOWN: d508c01f sys #0, c12, c0, #0
+// CHECK-ERROR: error: GSB sys requires: gcie
+
+GSB ACK
+// CHECK-INST: gsb ack
+// CHECK-ENCODING: [0x3f,0xc0,0x08,0xd5]
+// CHECK-UNKNOWN: d508c03f sys #0, c12, c0, #1
+// CHECK-ERROR: error: GSB ack requires: gcie
diff --git a/llvm/test/MC/AArch64/armv9.7a-memsys.s b/llvm/test/MC/AArch64/armv9.7a-memsys.s
new file mode 100644
index 0000000..228c71e
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-memsys.s
@@ -0,0 +1,141 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+cmh,+lscp < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+cmh,+lscp < %s \
+// RUN: | llvm-objdump -d --mattr=+cmh,+lscp --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+cmh,+lscp < %s \
+// RUN: | llvm-objdump -d --mattr=-cmh,-lscp --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+cmh,+lscp < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+cmh,+lscp -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// Armv9.7-A Contention Management Hints (FEAT_CMH).
+
+shuh
+// CHECK-INST: shuh
+// CHECK-ENCODING: encoding: [0x5f,0x26,0x03,0xd5]
+// CHECK-ERROR: error: instruction requires: cmh
+// CHECK-UNKNOWN: d503265f hint #50
+
+shuh ph
+// CHECK-INST: shuh ph
+// CHECK-ENCODING: encoding: [0x7f,0x26,0x03,0xd5]
+// CHECK-ERROR: error: instruction requires: cmh
+// CHECK-UNKNOWN: d503267f hint #51
+
+stcph
+// CHECK-INST: stcph
+// CHECK-ENCODING: encoding: [0x9f,0x26,0x03,0xd5]
+// CHECK-ERROR: error: instruction requires: cmh
+// CHECK-UNKNOWN: d503269f hint #52
+
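+// Armv9.7-A FEAT_LSCP paired load/store instructions.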
+ldap x0, x1, [x2]
+// CHECK-INST: ldap x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9415840 <unknown>
+
+ldap x0, x1, [x2, #0]
+// CHECK-INST: ldap x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9415840 <unknown>
+
+ldapp x0, x1, [x2]
+// CHECK-INST: ldapp x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x78,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9417840 <unknown>
+
+ldapp x0, x1, [x2, #0]
+// CHECK-INST: ldapp x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x78,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9417840 <unknown>
+
+stlp x0, x1, [x2, #0]
+// CHECK-INST: stlp x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x01,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9015840 <unknown>
+
+stlp x0, x1, [x2]
+// CHECK-INST: stlp x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x01,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9015840 <unknown>
+
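+// TLBID system registers (VTLBID<n>_EL2, VTLBIDOS<n>_EL2, TLBIDIDR_EL1).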
+mrs x3, VTLBID0_EL2
+// CHECK-INST: mrs x3, VTLBID0_EL2
+// CHECK-ENCODING: encoding: [0x03,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2803
+mrs x3, VTLBID1_EL2
+// CHECK-INST: mrs x3, VTLBID1_EL2
+// CHECK-ENCODING: encoding: [0x23,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2823
+mrs x3, VTLBID2_EL2
+// CHECK-INST: mrs x3, VTLBID2_EL2
+// CHECK-ENCODING: encoding: [0x43,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2843
+mrs x3, VTLBID3_EL2
+// CHECK-INST: mrs x3, VTLBID3_EL2
+// CHECK-ENCODING: encoding: [0x63,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2863
+mrs x3, VTLBIDOS0_EL2
+// CHECK-INST: mrs x3, VTLBIDOS0_EL2
+// CHECK-ENCODING: encoding: [0x03,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2903
+mrs x3, VTLBIDOS1_EL2
+// CHECK-INST: mrs x3, VTLBIDOS1_EL2
+// CHECK-ENCODING: encoding: [0x23,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2923
+mrs x3, VTLBIDOS2_EL2
+// CHECK-INST: mrs x3, VTLBIDOS2_EL2
+// CHECK-ENCODING: encoding: [0x43,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2943
+mrs x3, VTLBIDOS3_EL2
+// CHECK-INST: mrs x3, VTLBIDOS3_EL2
+// CHECK-ENCODING: encoding: [0x63,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2963
+mrs x3, TLBIDIDR_EL1
+// CHECK-INST: mrs x3, TLBIDIDR_EL1
+// CHECK-ENCODING: encoding: [0xc3,0xa4,0x38,0xd5]
+// CHECK-UNKNOWN: d538a4c3
+
+msr VTLBID0_EL2, x3
+// CHECK-INST: msr VTLBID0_EL2, x3
+// CHECK-ENCODING: encoding: [0x03,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2803
+msr VTLBID1_EL2, x3
+// CHECK-INST: msr VTLBID1_EL2, x3
+// CHECK-ENCODING: encoding: [0x23,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2823
+msr VTLBID2_EL2, x3
+// CHECK-INST: msr VTLBID2_EL2, x3
+// CHECK-ENCODING: encoding: [0x43,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2843
+msr VTLBID3_EL2, x3
+// CHECK-INST: msr VTLBID3_EL2, x3
+// CHECK-ENCODING: encoding: [0x63,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2863
+msr VTLBIDOS0_EL2, x3
+// CHECK-INST: msr VTLBIDOS0_EL2, x3
+// CHECK-ENCODING: encoding: [0x03,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2903
+msr VTLBIDOS1_EL2, x3
+// CHECK-INST: msr VTLBIDOS1_EL2, x3
+// CHECK-ENCODING: encoding: [0x23,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2923
+msr VTLBIDOS2_EL2, x3
+// CHECK-INST: msr VTLBIDOS2_EL2, x3
+// CHECK-ENCODING: encoding: [0x43,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2943
+msr VTLBIDOS3_EL2, x3
+// CHECK-INST: msr VTLBIDOS3_EL2, x3
+// CHECK-ENCODING: encoding: [0x63,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2963
+
diff --git a/llvm/test/MC/AArch64/armv9.7a-mpamv2-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-mpamv2-diagnostics.s
new file mode 100644
index 0000000..54fdc23
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mpamv2-diagnostics.s
@@ -0,0 +1,18 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+mpamv2 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_MPAMV2 Extensions
+//------------------------------------------------------------------------------
+
+mlbi alle1, x30
+// CHECK-ERROR: error: specified mlbi op does not use a register
+
+mlbi vmalle1, x30
+// CHECK-ERROR: error: specified mlbi op does not use a register
+
+mlbi vpide1
+// CHECK-ERROR: error: specified mlbi op requires a register
+
+mlbi vpmge1
+// CHECK-ERROR: error: specified mlbi op requires a register
diff --git a/llvm/test/MC/AArch64/armv9.7a-mpamv2.s b/llvm/test/MC/AArch64/armv9.7a-mpamv2.s
new file mode 100644
index 0000000..b8b21e96
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mpamv2.s
@@ -0,0 +1,126 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mpamv2 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mpamv2 < %s \
+// RUN: | llvm-objdump -d --mattr=+mpamv2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mpamv2 < %s \
+// RUN: | llvm-objdump -d --mattr=-mpamv2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mpamv2 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+mpamv2 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_MPAMV2 Extensions
+//------------------------------------------------------------------------------
+
+msr MPAMCTL_EL1, x0
+// CHECK-INST: msr MPAMCTL_EL1, x0
+// CHECK-ENCODING: [0x40,0xa5,0x18,0xd5]
+// CHECK-UNKNOWN: d518a540
+
+msr MPAMCTL_EL12, x0
+// CHECK-INST: msr MPAMCTL_EL12, x0
+// CHECK-ENCODING: [0x40,0xa5,0x1d,0xd5]
+// CHECK-UNKNOWN: d51da540
+
+msr MPAMCTL_EL2, x0
+// CHECK-INST: msr MPAMCTL_EL2, x0
+// CHECK-ENCODING: [0x40,0xa5,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ca540
+
+msr MPAMCTL_EL3, x0
+// CHECK-INST: msr MPAMCTL_EL3, x0
+// CHECK-ENCODING: [0x40,0xa5,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ea540
+
+msr MPAMVIDCR_EL2, x0
+// CHECK-INST: msr MPAMVIDCR_EL2, x0
+// CHECK-ENCODING: [0x00,0xa7,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ca700
+
+msr MPAMVIDSR_EL2, x0
+// CHECK-INST: msr MPAMVIDSR_EL2, x0
+// CHECK-ENCODING: [0x20,0xa7,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ca720
+
+msr MPAMVIDSR_EL3, x0
+// CHECK-INST: msr MPAMVIDSR_EL3, x0
+// CHECK-ENCODING: [0x20,0xa7,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ea720
+
+
+mrs x0, MPAMCTL_EL1
+// CHECK-INST: mrs x0, MPAMCTL_EL1
+// CHECK-ENCODING: [0x40,0xa5,0x38,0xd5]
+// CHECK-UNKNOWN: d538a540
+
+mrs x0, MPAMCTL_EL12
+// CHECK-INST: mrs x0, MPAMCTL_EL12
+// CHECK-ENCODING: [0x40,0xa5,0x3d,0xd5]
+// CHECK-UNKNOWN: d53da540
+
+mrs x0, MPAMCTL_EL2
+// CHECK-INST: mrs x0, MPAMCTL_EL2
+// CHECK-ENCODING: [0x40,0xa5,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ca540
+
+mrs x0, MPAMCTL_EL3
+// CHECK-INST: mrs x0, MPAMCTL_EL3
+// CHECK-ENCODING: [0x40,0xa5,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ea540
+
+mrs x0, MPAMVIDCR_EL2
+// CHECK-INST: mrs x0, MPAMVIDCR_EL2
+// CHECK-ENCODING: [0x00,0xa7,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ca700
+
+mrs x0, MPAMVIDSR_EL2
+// CHECK-INST: mrs x0, MPAMVIDSR_EL2
+// CHECK-ENCODING: [0x20,0xa7,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ca720
+
+mrs x0, MPAMVIDSR_EL3
+// CHECK-INST: mrs x0, MPAMVIDSR_EL3
+// CHECK-ENCODING: [0x20,0xa7,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ea720
+
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_MPAMV2_VID Extensions
+//------------------------------------------------------------------------------
+
+mlbi vmalle1
+// CHECK-INST: mlbi vmalle1
+// CHECK-ENCODING: [0xbf,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c70bf sys #4, c7, c0, #5
+// CHECK-ERROR: error: MLBI VMALLE1 requires: mpamv2
+
+mlbi vpide1, x0
+// CHECK-INST: mlbi vpide1, x0
+// CHECK-ENCODING: [0xc0,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c70c0 sys #4, c7, c0, #6, x0
+// CHECK-ERROR: error: MLBI VPIDE1 requires: mpamv2
+
+mlbi vpmge1, x0
+// CHECK-INST: mlbi vpmge1, x0
+// CHECK-ENCODING: [0xe0,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c70e0 sys #4, c7, c0, #7, x0
+// CHECK-ERROR: error: MLBI VPMGE1 requires: mpamv2
+
+// Check that encodings which do not map to a valid MLBI alias are printed as plain SYS instructions
+// [0x9f,0x70,0x0c,0xd5] -> mlbi alle1
+// [0x9e,0x70,0x0c,0xd5] -> sys #4, c7, c0, #4, x30
+
+mlbi alle1
+// CHECK-INST: mlbi alle1
+// CHECK-ENCODING: [0x9f,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c709f sys #4, c7, c0, #4
+// CHECK-ERROR: error: MLBI ALLE1 requires: mpamv2
+
+sys #4, c7, c0, #4, x30
+// CHECK-INST: sys #4, c7, c0, #4, x30
+// CHECK-ENCODING: [0x9e,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c709e sys #4, c7, c0, #4, x30
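
The CHECK-UNKNOWN lines above print encodings that do not match a known alias in the generic "sys #op1, Cn, Cm, #op2, Xt" form. As a rough sketch (not part of the patch; assumes the standard AArch64 SYS field layout), the fields unpack from the raw word like this:

    def decode_sys(word):
        # AArch64 SYS instruction field layout: op1 in bits [18:16],
        # CRn in [15:12], CRm in [11:8], op2 in [7:5], Rt in [4:0].
        return {
            "op1": (word >> 16) & 0x7,
            "CRn": (word >> 12) & 0xF,
            "CRm": (word >> 8) & 0xF,
            "op2": (word >> 5) & 0x7,
            "Rt":  word & 0x1F,
        }

    print(decode_sys(0xD50C709E))
    # {'op1': 4, 'CRn': 7, 'CRm': 0, 'op2': 4, 'Rt': 30} -> sys #4, c7, c0, #4, x30

Rt = 31 (xzr) selects the register-free alias form, which is why d50c709f above round-trips to "mlbi alle1" while d50c709e falls back to plain SYS.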
diff --git a/llvm/test/MC/AArch64/armv9.7a-mtetc-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-mtetc-diagnostics.s
new file mode 100644
index 0000000..dc2a290
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mtetc-diagnostics.s
@@ -0,0 +1,16 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+mtetc -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-REQUIRES-MTETC
+
+//------------------------------------------------------------------------------
+// FEAT_MTETC Extension instructions
+//------------------------------------------------------------------------------
+
+dc zgbva
+// CHECK-ERROR: error: specified dc op requires a register
+// CHECK-REQUIRES-MTETC: DC ZGBVA requires: mtetc
+
+dc gbva
+// CHECK-ERROR: error: specified dc op requires a register
+// CHECK-REQUIRES-MTETC: DC GBVA requires: mtetc
diff --git a/llvm/test/MC/AArch64/armv9.7a-mtetc.s b/llvm/test/MC/AArch64/armv9.7a-mtetc.s
new file mode 100644
index 0000000..087b23b
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mtetc.s
@@ -0,0 +1,29 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mtetc < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mtetc < %s \
+// RUN: | llvm-objdump -d --mattr=+mtetc --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mtetc < %s \
+// RUN: | llvm-objdump -d --mattr=-mtetc --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mtetc < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+mtetc -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+//------------------------------------------------------------------------------
+// FEAT_MTETC Extension instructions
+//------------------------------------------------------------------------------
+
+dc zgbva, x0
+// CHECK-INST: dc zgbva, x0
+// CHECK-ENCODING: [0xa0,0x74,0x0b,0xd5]
+// CHECK-UNKNOWN: d50b74a0 sys #3, c7, c4, #5, x0
+// CHECK-ERROR: error: DC ZGBVA requires: mtetc
+
+dc gbva, x0
+// CHECK-INST: dc gbva, x0
+// CHECK-ENCODING: [0xe0,0x74,0x0b,0xd5]
+// CHECK-UNKNOWN: d50b74e0 sys #3, c7, c4, #7, x0
+// CHECK-ERROR: error: DC GBVA requires: mtetc
diff --git a/llvm/test/MC/AArch64/armv9.7a-tlbid-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-tlbid-diagnostics.s
new file mode 100644
index 0000000..2440fd3
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-tlbid-diagnostics.s
@@ -0,0 +1,64 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+tlb-rmi,+tlbiw,+rme < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+tlb-rmi,+tlbiw,+tlbid,+rme < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-NO-REGISTER
+
+// Test without +tlbid: no optional register operand is allowed
+
+tlbi vmalle1is, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalle1is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalle1os, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalls12e1os, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi alle1is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi alle2is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi alle3is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmallws2e1os, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalls12e1is, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmallws2e1is, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalle1, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi alle1, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi alle2, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi alle3, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi vmalls12e1, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi paallos, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi paall, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
diff --git a/llvm/test/MC/AArch64/armv9.7a-tlbid.s b/llvm/test/MC/AArch64/armv9.7a-tlbid.s
new file mode 100644
index 0000000..1362bd3
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-tlbid.s
@@ -0,0 +1,84 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+tlb-rmi,+tlbiw < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN: | llvm-objdump -d --mattr=+tlbid,+tlb-rmi,+tlbiw --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN: | llvm-objdump -d --mattr=-tlbid --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+tlbid,+tlb-rmi,+tlbiw -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// Armv9.7-A TLBI Domains (FEAT_TLBID)
+
+tlbi vmalle1is
+// CHECK-INST: tlbi vmalle1is
+// CHECK-ENCODING: encoding: [0x1f,0x83,0x08,0xd5]
+// CHECK-UNKNOWN: d508831f tlbi vmalle1is
+
+tlbi vmalle1is, xzr
+// CHECK-INST: tlbi vmalle1is
+// CHECK-ENCODING: encoding: [0x1f,0x83,0x08,0xd5]
+// CHECK-UNKNOWN: d508831f tlbi vmalle1is
+
+tlbi vmalle1is, x31
+// CHECK-INST: tlbi vmalle1is
+// CHECK-ENCODING: encoding: [0x1f,0x83,0x08,0xd5]
+// CHECK-UNKNOWN: d508831f tlbi vmalle1is
+
+tlbi vmalle1is, x5
+// CHECK-INST: tlbi vmalle1is, x5
+// CHECK-ENCODING: encoding: [0x05,0x83,0x08,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d5088305 sys #0, c8, c3, #0, x5
+
+tlbi vmalle1os, x5
+// CHECK-INST: tlbi vmalle1os, x5
+// CHECK-ENCODING: encoding: [0x05,0x81,0x08,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d5088105 sys #0, c8, c1, #0, x5
+
+tlbi alle1is, x5
+// CHECK-INST: tlbi alle1is, x5
+// CHECK-ENCODING: encoding: [0x85,0x83,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8385 sys #4, c8, c3, #4, x5
+
+tlbi alle2is, x5
+// CHECK-INST: tlbi alle2is, x5
+// CHECK-ENCODING: encoding: [0x05,0x83,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8305 sys #4, c8, c3, #0, x5
+
+tlbi alle3is, x5
+// CHECK-INST: tlbi alle3is, x5
+// CHECK-ENCODING: encoding: [0x05,0x83,0x0e,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50e8305 sys #6, c8, c3, #0, x5
+
+tlbi vmalls12e1is, x1
+// CHECK-INST: tlbi vmalls12e1is, x1
+// CHECK-ENCODING: encoding: [0xc1,0x83,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c83c1 sys #4, c8, c3, #6, x1
+
+tlbi vmalls12e1os, x5
+// CHECK-INST: tlbi vmalls12e1os, x5
+// CHECK-ENCODING: encoding: [0xc5,0x81,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c81c5 sys #4, c8, c1, #6, x5
+
+tlbi vmallws2e1is, x1
+// CHECK-INST: tlbi vmallws2e1is, x1
+// CHECK-ENCODING: encoding: [0x41,0x82,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8241 sys #4, c8, c2, #2, x1
+
+tlbi vmallws2e1os, x1
+// CHECK-INST: tlbi vmallws2e1os, x1
+// CHECK-ENCODING: encoding: [0x41,0x85,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8541 sys #4, c8, c5, #2, x1
diff --git a/llvm/test/MC/AArch64/neon-fdot-diagnostics.s b/llvm/test/MC/AArch64/neon-fdot-diagnostics.s
new file mode 100644
index 0000000..4f5f557
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fdot-diagnostics.s
@@ -0,0 +1,59 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+f16f32dot 2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand
+
+fdot v0.2s, v0.4b, v0.4b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4b, v0.4b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2b, v0.4b, v0.4b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2b, v0.4b, v0.4b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2s, v0.4s, v0.4s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4s, v0.4s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2h, v0.4h, v0.4h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2h, v0.4h, v0.4h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// fdot indexed
+
+fdot v0.2s, v0.4b, v0.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4b, v0.4b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2b, v0.4b, v0.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2b, v0.4b, v0.4b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2s, v0.4s, v0.4s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4s, v0.4s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2h, v0.4h, v0.4h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2h, v0.4h, v0.4h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid immediate range
+
+fdot v0.2s, v0.4h, v0.2h[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+// CHECK-NEXT: fdot v0.2s, v0.4h, v0.2h[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2s, v0.4h, v0.2h[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+// CHECK-NEXT: fdot v0.2s, v0.4h, v0.2h[4]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/neon-fdot.s b/llvm/test/MC/AArch64/neon-fdot.s
new file mode 100644
index 0000000..c8a8e2f
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fdot.s
@@ -0,0 +1,147 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32dot < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32dot < %s \
+// RUN: | llvm-objdump -d --mattr=+f16f32dot --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32dot < %s \
+// RUN: | llvm-objdump -d --mattr=-f16f32dot --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32dot < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+f16f32dot -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fdot v0.2s, v0.4h, v0.4h
+// CHECK-INST: fdot v0.2s, v0.4h, v0.4h
+// CHECK-ENCODING: encoding: [0x00,0xfc,0x80,0x0e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0e80fc00 <unknown>
+
+fdot v10.2s, v10.4h, v10.4h
+// CHECK-INST: fdot v10.2s, v10.4h, v10.4h
+// CHECK-ENCODING: encoding: [0x4a,0xfd,0x8a,0x0e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0e8afd4a <unknown>
+
+fdot v31.2s, v31.4h, v31.4h
+// CHECK-INST: fdot v31.2s, v31.4h, v31.4h
+// CHECK-ENCODING: encoding: [0xff,0xff,0x9f,0x0e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0e9fffff <unknown>
+
+fdot v0.4s, v0.8h, v0.8h
+// CHECK-INST: fdot v0.4s, v0.8h, v0.8h
+// CHECK-ENCODING: encoding: [0x00,0xfc,0x80,0x4e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4e80fc00 <unknown>
+
+fdot v10.4s, v10.8h, v10.8h
+// CHECK-INST: fdot v10.4s, v10.8h, v10.8h
+// CHECK-ENCODING: encoding: [0x4a,0xfd,0x8a,0x4e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4e8afd4a <unknown>
+
+fdot v31.4s, v31.8h, v31.8h
+// CHECK-INST: fdot v31.4s, v31.8h, v31.8h
+// CHECK-ENCODING: encoding: [0xff,0xff,0x9f,0x4e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4e9fffff <unknown>
+
+// fdot indexed
+
+fdot v0.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v0.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x00,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409000 <unknown>
+
+fdot v10.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v10.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x0a,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40900a <unknown>
+
+fdot v21.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v21.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x15,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409015 <unknown>
+
+fdot v31.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v31.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x1f,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40901f <unknown>
+
+fdot v0.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v0.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x40,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409140 <unknown>
+
+fdot v10.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v10.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x4a,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40914a <unknown>
+
+fdot v21.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v21.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x55,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409155 <unknown>
+
+fdot v31.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v31.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x5f,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40915f <unknown>
+
+fdot v0.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v0.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xa0,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9aa0 <unknown>
+
+fdot v10.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v10.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xaa,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9aaa <unknown>
+
+fdot v21.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v21.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xb5,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9ab5 <unknown>
+
+fdot v31.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v31.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xbf,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9abf <unknown>
+
+fdot v0.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v0.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xe0,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9be0 <unknown>
+
+fdot v10.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v10.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xea,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9bea <unknown>
+
+fdot v21.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v21.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xf5,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9bf5 <unknown>
+
+fdot v31.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v31.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xff,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9bff <unknown>
diff --git a/llvm/test/MC/AArch64/neon-fmmla-HtoS-diagnostics.s b/llvm/test/MC/AArch64/neon-fmmla-HtoS-diagnostics.s
new file mode 100644
index 0000000..ccc0742
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla-HtoS-diagnostics.s
@@ -0,0 +1,24 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+f16f32mm 2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand/vector
+
+fmmla v0.4b, v0.8b, v0.8b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.4b, v0.8b, v0.8b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.4h, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.4h, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.4s, v0.8s, v0.8s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.4s, v0.8s, v0.8s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.4d, v0.8d, v0.8d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.4d, v0.8d, v0.8d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/neon-fmmla-HtoS.s b/llvm/test/MC/AArch64/neon-fmmla-HtoS.s
new file mode 100644
index 0000000..6b3d352
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla-HtoS.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32mm < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32mm < %s \
+// RUN: | llvm-objdump -d --mattr=+f16f32mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32mm < %s \
+// RUN: | llvm-objdump -d --mattr=-f16f32mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32mm < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+f16f32mm -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fmmla v0.4s, v0.8h, v0.8h
+// CHECK-INST: fmmla v0.4s, v0.8h, v0.8h
+// CHECK-ENCODING: encoding: [0x00,0xec,0x40,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e40ec00 <unknown>
+
+fmmla v10.4s, v10.8h, v10.8h
+// CHECK-INST: fmmla v10.4s, v10.8h, v10.8h
+// CHECK-ENCODING: encoding: [0x4a,0xed,0x4a,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e4aed4a <unknown>
+
+fmmla v21.4s, v21.8h, v21.8h
+// CHECK-INST: fmmla v21.4s, v21.8h, v21.8h
+// CHECK-ENCODING: encoding: [0xb5,0xee,0x55,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e55eeb5 <unknown>
+
+fmmla v31.4s, v31.8h, v31.8h
+// CHECK-INST: fmmla v31.4s, v31.8h, v31.8h
+// CHECK-ENCODING: encoding: [0xff,0xef,0x5f,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e5fefff <unknown>
diff --git a/llvm/test/MC/AArch64/neon-fmmla-diagnostics.s b/llvm/test/MC/AArch64/neon-fmmla-diagnostics.s
new file mode 100644
index 0000000..7fc5373
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla-diagnostics.s
@@ -0,0 +1,24 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+f16mm 2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand/vector
+
+fmmla v0.8b, v0.8b, v0.8b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.8b, v0.8b, v0.8b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.8b, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.8b, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.8s, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.8s, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.8d, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.8d, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/neon-fmmla.s b/llvm/test/MC/AArch64/neon-fmmla.s
new file mode 100644
index 0000000..f35c2fb
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16mm < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16mm < %s \
+// RUN: | llvm-objdump -d --mattr=+f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16mm < %s \
+// RUN: | llvm-objdump -d --mattr=-f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16mm < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+f16mm -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fmmla v0.8h, v0.8h, v0.8h
+// CHECK-INST: fmmla v0.8h, v0.8h, v0.8h
+// CHECK-ENCODING: encoding: [0x00,0xec,0xc0,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4ec0ec00 <unknown>
+
+fmmla v10.8h, v10.8h, v10.8h
+// CHECK-INST: fmmla v10.8h, v10.8h, v10.8h
+// CHECK-ENCODING: encoding: [0x4a,0xed,0xca,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4ecaed4a <unknown>
+
+fmmla v21.8h, v21.8h, v21.8h
+// CHECK-INST: fmmla v21.8h, v21.8h, v21.8h
+// CHECK-ENCODING: encoding: [0xb5,0xee,0xd5,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4ed5eeb5 <unknown>
+
+fmmla v31.8h, v31.8h, v31.8h
+// CHECK-INST: fmmla v31.8h, v31.8h, v31.8h
+// CHECK-ENCODING: encoding: [0xff,0xef,0xdf,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4edfefff <unknown>
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
index 4f7ca47..358fe0b 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
@@ -45,6 +45,10 @@ s_set_vgpr_msb 255
// GFX1250: [0xff,0x00,0x86,0xbf]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+s_set_vgpr_msb 0xffff
+// GFX1250: [0xff,0xff,0x86,0xbf]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
s_monitor_sleep 1
// GFX1250: s_monitor_sleep 1 ; encoding: [0x01,0x00,0x84,0xbf]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_err.s b/llvm/test/MC/AMDGPU/gfx1250_err.s
index 9d1131e..676eb48 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_err.s
@@ -1,15 +1,5 @@
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX1250-ERR --implicit-check-not=error: -strict-whitespace %s
-s_set_vgpr_msb -1
-// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: s_set_vgpr_msb accepts values in range [0..255]
-// GFX1250-ERR: s_set_vgpr_msb -1
-// GFX1250-ERR: ^
-
-s_set_vgpr_msb 256
-// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: s_set_vgpr_msb accepts values in range [0..255]
-// GFX1250-ERR: s_set_vgpr_msb 256
-// GFX1250-ERR: ^
-
s_load_b32 s4, s[2:3], 10 th:TH_LOAD_NT th:TH_LOAD_NT
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
// GFX1250-ERR: s_load_b32 s4, s[2:3], 10 th:TH_LOAD_NT th:TH_LOAD_NT
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt b/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt
index ad22000..16eba25 100644
--- a/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt
+++ b/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt
@@ -65,35 +65,39 @@
#CHECK: mrs x0, MPAMVPM7_EL2
#CHECK: mrs x0, MPAMIDR_EL1
-#CHECK-NOV84: msr S3_0_C10_C5_1, x0
-#CHECK-NOV84: msr S3_0_C10_C5_0, x0
-#CHECK-NOV84: msr S3_4_C10_C5_0, x0
-#CHECK-NOV84: msr S3_6_C10_C5_0, x0
-#CHECK-NOV84: msr S3_5_C10_C5_0, x0
-#CHECK-NOV84: msr S3_4_C10_C4_0, x0
-#CHECK-NOV84: msr S3_4_C10_C4_1, x0
-#CHECK-NOV84: msr S3_4_C10_C6_0, x0
-#CHECK-NOV84: msr S3_4_C10_C6_1, x0
-#CHECK-NOV84: msr S3_4_C10_C6_2, x0
-#CHECK-NOV84: msr S3_4_C10_C6_3, x0
-#CHECK-NOV84: msr S3_4_C10_C6_4, x0
-#CHECK-NOV84: msr S3_4_C10_C6_5, x0
-#CHECK-NOV84: msr S3_4_C10_C6_6, x0
-#CHECK-NOV84: msr S3_4_C10_C6_7, x0
-#CHECK-NOV84: mrs x0, S3_0_C10_C5_1
-#CHECK-NOV84: mrs x0, S3_0_C10_C5_0
-#CHECK-NOV84: mrs x0, S3_4_C10_C5_0
-#CHECK-NOV84: mrs x0, S3_6_C10_C5_0
-#CHECK-NOV84: mrs x0, S3_5_C10_C5_0
-#CHECK-NOV84: mrs x0, S3_4_C10_C4_0
-#CHECK-NOV84: mrs x0, S3_4_C10_C4_1
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_0
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_1
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_2
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_3
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_4
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_5
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_6
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_7
-#CHECK-NOV84: mrs x0, S3_0_C10_C4_4
+# Available outside MPAM from Armv9.7
+#CHECK-NOV84: msr MPAM0_EL1, x0
+#CHECK-NOV84: msr MPAM1_EL1, x0
+#CHECK-NOV84: msr MPAM2_EL2, x0
+#CHECK-NOV84: msr MPAM3_EL3, x0
+#CHECK-NOV84: msr MPAM1_EL12, x0
+#CHECK-NOV84: msr MPAMHCR_EL2, x0
+#CHECK-NOV84: msr MPAMVPMV_EL2, x0
+#CHECK-NOV84: msr MPAMVPM0_EL2, x0
+#CHECK-NOV84: msr MPAMVPM1_EL2, x0
+#CHECK-NOV84: msr MPAMVPM2_EL2, x0
+#CHECK-NOV84: msr MPAMVPM3_EL2, x0
+#CHECK-NOV84: msr MPAMVPM4_EL2, x0
+#CHECK-NOV84: msr MPAMVPM5_EL2, x0
+#CHECK-NOV84: msr MPAMVPM6_EL2, x0
+#CHECK-NOV84: msr MPAMVPM7_EL2, x0
+
+# Available outside MPAM from Armv9.7
+#CHECK-NOV84: mrs x0, MPAM0_EL1
+#CHECK-NOV84: mrs x0, MPAM1_EL1
+#CHECK-NOV84: mrs x0, MPAM2_EL2
+#CHECK-NOV84: mrs x0, MPAM3_EL3
+#CHECK-NOV84: mrs x0, MPAM1_EL12
+#CHECK-NOV84: mrs x0, MPAMHCR_EL2
+
+#CHECK-NOV84: mrs x0, MPAMVPMV_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM0_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM1_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM2_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM3_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM4_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM5_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM6_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM7_EL2
+#CHECK-NOV84: mrs x0, MPAMIDR_EL1
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
index a8627d6..b84324b 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
@@ -33,6 +33,9 @@
# GFX1250: s_set_vgpr_msb 0xff ; encoding: [0xff,0x00,0x86,0xbf]
0xff,0x00,0x86,0xbf
+# GFX1250: s_set_vgpr_msb 0xffff ; encoding: [0xff,0xff,0x86,0xbf]
+0xff,0xff,0x86,0xbf
+
# GFX1250: s_monitor_sleep 0 ; encoding: [0x00,0x00,0x84,0xbf]
0x00,0x00,0x84,0xbf
diff --git a/llvm/test/MC/Disassembler/Xtensa/debug.txt b/llvm/test/MC/Disassembler/Xtensa/debug.txt
index 1321f09..5438760 100644
--- a/llvm/test/MC/Disassembler/Xtensa/debug.txt
+++ b/llvm/test/MC/Disassembler/Xtensa/debug.txt
@@ -9,7 +9,7 @@
# CHECK-DEBUG: break 1, 1
# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
-[0x2c,0xf1]
+[0x2d,0xf1]
# CHECK-DEBUG: break.n 1
# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s b/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s
new file mode 100644
index 0000000..cb44a76
--- /dev/null
+++ b/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s
@@ -0,0 +1,36 @@
+# RUN: llvm-mc --filetype=obj --gsframe -triple x86_64 %s -o %t.o 2>&1 | FileCheck %s
+# RUN: llvm-readelf --sframe %t.o | FileCheck %s --check-prefix=NOFDES
+
+## Tests that .cfi_escape sequences that are unrepresentable in sframe
+## trigger a warning and do not produce FDEs.
+
+ .align 1024
+cfi_escape_sp:
+ .cfi_startproc
+ .long 0
+## Defining SP in terms of another register is unrepresentable in sframe.
+## DW_CFA_expression,reg 0x7,length 2,DW_OP_breg6,SLEB(-8)
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_expression with SP reg 7
+ .cfi_escape 0x10, 0x7, 0x2, 0x76, 0x78
+ .long 0
+.cfi_endproc
+
+cfi_escape_args_sp:
+ .cfi_startproc
+ .long 0
+## DW_CFA_GNU_args_size is not OK if the CFA is based on SP
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_GNU_args_size with non frame-pointer CFA
+ .cfi_escape 0x2e, 0x20
+ .cfi_endproc
+
+cfi_escape_val_offset:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+## DW_CFA_val_offset,rbp,ULEB scaled offset(16)
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_val_offset with FP reg 6
+ .cfi_escape 0x14,0x6,0x2
+ .long 0
+ .cfi_endproc
+
+# NOFDES: Num FDEs: 0
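
The .cfi_escape operands above are raw DWARF call-frame bytes. A minimal decode sketch (not part of the patch; assumes the standard DWARF constants DW_CFA_expression = 0x10 and DW_OP_breg0 = 0x70, with offsets encoded as SLEB128) for the first sequence:

    def sleb128(data, i):
        # Decode a signed LEB128 value starting at data[i].
        result, shift = 0, 0
        while True:
            byte = data[i]
            i += 1
            result |= (byte & 0x7F) << shift
            shift += 7
            if not byte & 0x80:
                if byte & 0x40:  # sign bit of the final byte: sign-extend
                    result -= 1 << shift
                return result, i

    # .cfi_escape 0x10, 0x7, 0x2, 0x76, 0x78
    esc = [0x10, 0x07, 0x02, 0x76, 0x78]
    assert esc[0] == 0x10            # DW_CFA_expression
    reg, length = esc[1], esc[2]     # ULEB128 register and expression length
    assert esc[3] == 0x70 + 6        # DW_OP_breg6 (rbp on x86-64)
    offset, _ = sleb128(esc, 4)
    print(f"rule for reg {reg}: DW_OP_breg6 {offset:+d}")  # reg 7 (rsp) = rbp - 8

DWARF register 7 on x86-64 is rsp, so this escape redefines SP as an expression over rbp, which sframe cannot represent; that is exactly the case the first warning above rejects.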
diff --git a/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s b/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s
new file mode 100644
index 0000000..df8e7d2
--- /dev/null
+++ b/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s
@@ -0,0 +1,46 @@
+# RUN: llvm-mc --filetype=obj --gsframe -triple x86_64 %s -o %t.o
+# RUN: llvm-readelf --sframe %t.o | FileCheck %s
+
+## Tests that .cfi_escape sequences that are safe to pass through still produce FDEs.
+
+ .align 1024
+cfi_escape_ok:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+ ## Uninteresting register
+## DW_CFA_expression,reg 0xc,length 2,DW_OP_breg6,SLEB(-8)
+ .cfi_escape 0x10,0xc,0x2,0x76,0x78
+## DW_CFA_nop
+ .cfi_escape 0x0
+ .cfi_escape 0x0,0x0,0x0,0x0
+ ## Uninteresting register
+## DW_CFA_val_offset,reg 0xc,ULEB scaled offset
+ .cfi_escape 0x14,0xc,0x4
+ .long 0
+ .cfi_endproc
+
+cfi_escape_gnu_args_fp:
+ .cfi_startproc
+ .long 0
+## DW_CFA_GNU_args_size is OK if arg size is zero
+ .cfi_escape 0x2e, 0x0
+ .long 0
+ .cfi_def_cfa_register 6
+ .long 0
+## DW_CFA_GNU_args_size is OK if cfa is FP
+ .cfi_escape 0x2e, 0x20
+ .cfi_endproc
+
+cfi_escape_long_expr:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+## This is a long but valid DWARF expression without sframe
+## implications. An FDE can still be created.
+## DW_CFA_val_offset,rcx,ULEB scaled offset(16), DW_CFA_expression,r10,length 2,DW_OP_breg6,SLEB(-8)
+ .cfi_escape 0x14,0x2,0x2,0x10,0xa,0x2,0x76,0x78
+ .long 0
+ .cfi_endproc
+
+# CHECK: Num FDEs: 3
diff --git a/llvm/test/MC/Hexagon/arch-support.s b/llvm/test/MC/Hexagon/arch-support.s
index eb362a7..94a6eb1 100644
--- a/llvm/test/MC/Hexagon/arch-support.s
+++ b/llvm/test/MC/Hexagon/arch-support.s
@@ -10,6 +10,7 @@
# RUN: llvm-mc -triple=hexagon -mv73 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V73 %s
# RUN: llvm-mc -triple=hexagon -mv75 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V75 %s
# RUN: llvm-mc -triple=hexagon -mv79 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V79 %s
+# RUN: llvm-mc -triple=hexagon -mv81 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V81 %s
## Check which arch version llvm-mc sets when the user does not provide one.
# RUN: llvm-mc -triple=hexagon -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-DEFAULT %s
@@ -26,6 +27,7 @@
# RUN: llvm-mc -triple=hexagon -mv73 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
# RUN: llvm-mc -triple=hexagon -mv75 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
# RUN: llvm-mc -triple=hexagon -mv79 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
+# RUN: llvm-mc -triple=hexagon -mv81 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
.text
r1 = r1
@@ -41,6 +43,7 @@ r1 = r1
# CHECK-V73: Flags:{{.*}}0x73
# CHECK-V75: Flags:{{.*}}0x75
# CHECK-V79: Flags:{{.*}}0x79
+# CHECK-V81: Flags:{{.*}}0x81
# CHECK-DEFAULT: Flags:{{.*}}0x68
# CHECK-OBJDUMP: { r1 = r1 }
diff --git a/llvm/test/MC/Hexagon/v81_arch.s b/llvm/test/MC/Hexagon/v81_arch.s
new file mode 100644
index 0000000..0cd5d6b
--- /dev/null
+++ b/llvm/test/MC/Hexagon/v81_arch.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=hexagon -mcpu=hexagonv81 -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -triple=hexagon -mcpu=hexagonv81 -mhvx -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+
+r1=memw(r0)
+{ r0=r0
+ memw(r0)=r0.new }
+
+# CHECK: { r1 = memw(r0+#0x0) }
+# CHECK: { r0 = r0
+# CHECK: memw(r0+#0x0) = r0.new }
diff --git a/llvm/test/MC/PowerPC/ppc64-encoding-ext.s b/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
index 959f3c5..6662220 100644
--- a/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
+++ b/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
@@ -3491,12 +3491,18 @@
# CHECK-BE: mfamr 2 # encoding: [0x7c,0x5d,0x02,0xa6]
# CHECK-LE: mfamr 2 # encoding: [0xa6,0x02,0x5d,0x7c]
mfamr 2
-# CHECK-BE: mtpid 2 # encoding: [0x7c,0x50,0x0b,0xa6]
-# CHECK-LE: mtpid 2 # encoding: [0xa6,0x0b,0x50,0x7c]
+# CHECK-BE: mtspr 48, 2 # encoding: [0x7c,0x50,0x0b,0xa6]
+# CHECK-LE: mtspr 48, 2 # encoding: [0xa6,0x0b,0x50,0x7c]
mtpid 2
-# CHECK-BE: mfpid 2 # encoding: [0x7c,0x50,0x0a,0xa6]
-# CHECK-LE: mfpid 2 # encoding: [0xa6,0x0a,0x50,0x7c]
+# CHECK-BE: mtspr 48, 2 # encoding: [0x7c,0x50,0x0b,0xa6]
+# CHECK-LE: mtspr 48, 2 # encoding: [0xa6,0x0b,0x50,0x7c]
+ mtpidr 2
+# CHECK-BE: mfspr 2, 48 # encoding: [0x7c,0x50,0x0a,0xa6]
+# CHECK-LE: mfspr 2, 48 # encoding: [0xa6,0x0a,0x50,0x7c]
mfpid 2
+# CHECK-BE: mfspr 2, 48 # encoding: [0x7c,0x50,0x0a,0xa6]
+# CHECK-LE: mfspr 2, 48 # encoding: [0xa6,0x0a,0x50,0x7c]
+ mfpidr 2
# CHECK-BE: mtlr 2 # encoding: [0x7c,0x48,0x03,0xa6]
# CHECK-LE: mtlr 2 # encoding: [0xa6,0x03,0x48,0x7c]
mtlr 2
diff --git a/llvm/test/MC/Xtensa/debug.s b/llvm/test/MC/Xtensa/debug.s
index 36b1f11..4ca6368 100644
--- a/llvm/test/MC/Xtensa/debug.s
+++ b/llvm/test/MC/Xtensa/debug.s
@@ -11,7 +11,7 @@ break 1, 1
# Instruction format RRRN
# CHECK-INST: break.n 1
-# CHECK: encoding: [0x2c,0xf1]
+# CHECK: encoding: [0x2d,0xf1]
break.n 1
# Instruction format RRR
diff --git a/llvm/test/MachineVerifier/test_g_shuffle_vector.mir b/llvm/test/MachineVerifier/test_g_shuffle_vector.mir
index 6aba6731..c4ca2d2 100644
--- a/llvm/test/MachineVerifier/test_g_shuffle_vector.mir
+++ b/llvm/test/MachineVerifier/test_g_shuffle_vector.mir
@@ -50,10 +50,16 @@ body: |
%20:_(<2 x s32>) = G_SHUFFLE_VECTOR %3, %19, shufflemask(1, 0)
; CHECK: Bad machine code: G_SHUFFLE_VECTOR cannot change element type
- %21:_(s16) = G_SHUFFLE_VECTOR %3, %4, shufflemask(0)
+ %21:_(s16) = G_SHUFFLE_VECTOR %3, %4, shufflemask(0, 1)
+
+ ; CHECK: Bad machine code: G_SHUFFLE_VECTOR must have vector src
+ %22:_(<2 x s32>) = G_SHUFFLE_VECTOR %3, %4, shufflemask(0, 0)
+
+ %23:_(p0) = G_IMPLICIT_DEF
+ ; CHECK: Bad machine code: G_SHUFFLE_VECTOR must have vector src
+ %24:_(<2 x p0>) = G_SHUFFLE_VECTOR %23, %23, shufflemask(0, 0)
; CHECK: Bad machine code: Out of bounds shuffle index
- %22:_(s32) = G_IMPLICIT_DEF
- %20:_(<2 x s32>) = G_SHUFFLE_VECTOR %22, %22, shufflemask(0, 2)
+ %26:_(<2 x s32>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0, 7)
...
diff --git a/llvm/test/TableGen/intrinsic-manual-name.td b/llvm/test/TableGen/intrinsic-manual-name.td
new file mode 100644
index 0000000..5751fc2
--- /dev/null
+++ b/llvm/test/TableGen/intrinsic-manual-name.td
@@ -0,0 +1,6 @@
+// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS 2>&1 | FileCheck %s -DFILE=%s
+
+include "llvm/IR/Intrinsics.td"
+
+// CHECK: [[FILE]]:[[@LINE+1]]:5: note: Explicitly specified name matches default name, consider dropping it
+def int_foo0 : Intrinsic<[llvm_anyint_ty], [], [], "llvm.foo0">;
diff --git a/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll b/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
index 1f0737b..d0b8d14 100644
--- a/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
+++ b/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
@@ -24,9 +24,9 @@
; RUN: llvm-dis %t5.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR1
; RUN: llvm-dis %t5.2.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR2
-; PRINT-DAG: Devirtualized call to {{.*}} (_ZN1A1nEi)
+; PRINT-DAG: Devirtualized call to {{.*}} (_ZN1B1nEi)
-; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1B1nEi
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
@@ -55,7 +55,7 @@ entry:
ret i32 0
}
-; CHECK-IR1: define i32 @test(
+; CHECK-IR1: define noundef i32 @test(
define i32 @test(ptr %obj, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
@@ -65,7 +65,7 @@ entry:
%fptr1 = load ptr, ptr %fptrptr, align 8
; Check that the call was devirtualized.
- ; CHECK-IR1: tail call i32 {{.*}}@_ZN1A1nEi
+ ; CHECK-IR1: tail call i32 {{.*}}@_ZN1B1nEi
%call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a)
ret i32 %call
@@ -73,7 +73,7 @@ entry:
; CHECK-IR2: define i32 @test2
; Check that the call was devirtualized.
-; CHECK-IR2: tail call i32 @_ZN1A1nEi
+; CHECK-IR2: tail call i32 @_ZN1B1nEi
declare i1 @llvm.type.test(ptr, metadata)
declare void @llvm.assume(i1)
diff --git a/llvm/test/ThinLTO/X86/dtlto/json.ll b/llvm/test/ThinLTO/X86/dtlto/json.ll
index 1a38438..ee1c428 100644
--- a/llvm/test/ThinLTO/X86/dtlto/json.ll
+++ b/llvm/test/ThinLTO/X86/dtlto/json.ll
@@ -7,21 +7,33 @@ RUN: rm -rf %t && split-file %s %t && cd %t
RUN: opt -thinlto-bc t1.ll -o t1.bc
RUN: opt -thinlto-bc t2.ll -o t2.bc
-; Perform DTLTO.
+; Perform DTLTO with clang.
RUN: not llvm-lto2 run t1.bc t2.bc -o my.output \
RUN: -r=t1.bc,t1,px -r=t2.bc,t2,px \
RUN: -dtlto-distributor=%python \
RUN: -dtlto-distributor-arg=%llvm_src_root/utils/dtlto/validate.py,--da1=10,--da2=10 \
RUN: -dtlto-compiler=my_clang.exe \
RUN: -dtlto-compiler-arg=--rota1=10,--rota2=20 \
-RUN: 2>&1 | FileCheck %s
+RUN: 2>&1 | FileCheck --check-prefixes=CHECK,CLANG %s
+
+; Perform DTLTO with LLVM driver.
+RUN: not llvm-lto2 run t1.bc t2.bc -o my.output \
+RUN: -r=t1.bc,t1,px -r=t2.bc,t2,px \
+RUN: -dtlto-distributor=%python \
+RUN: -dtlto-distributor-arg=%llvm_src_root/utils/dtlto/validate.py,--da1=10,--da2=10 \
+RUN: -dtlto-compiler=llvm \
+RUN: -dtlto-compiler-prepend-arg=clang \
+RUN: -dtlto-compiler-arg=--rota1=10,--rota2=20 \
+RUN: 2>&1 | FileCheck --check-prefixes=CHECK,LLVM %s
CHECK: distributor_args=['--da1=10', '--da2=10']
; Check the common object.
CHECK: "linker_output": "my.output"
CHECK: "args":
-CHECK-NEXT: "my_clang.exe"
+CLANG-NEXT: "my_clang.exe"
+LLVM-NEXT: "llvm"
+LLVM-NEXT: "clang"
CHECK-NEXT: "-c"
CHECK-NEXT: "--target=x86_64-unknown-linux-gnu"
CHECK-NEXT: "-O2"
@@ -30,7 +42,7 @@ CHECK-NEXT: "-Wno-unused-command-line-argument"
CHECK-NEXT: "--rota1=10"
CHECK-NEXT: "--rota2=20"
CHECK-NEXT: ]
-CHECK: "inputs": []
+CHECK: "inputs": []
; Check the first job entry.
CHECK: "args":
diff --git a/llvm/test/Transforms/ADCE/2016-09-06.ll b/llvm/test/Transforms/ADCE/2016-09-06.ll
index 850f412..1329ac6 100644
--- a/llvm/test/Transforms/ADCE/2016-09-06.ll
+++ b/llvm/test/Transforms/ADCE/2016-09-06.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
-define i32 @foo(i32, i32, i32) #0 {
+define i32 @foo(i32, i32, i32) {
%4 = alloca i32, align 4
%5 = alloca i32, align 4
%6 = alloca i32, align 4
@@ -48,8 +48,6 @@ B21:
ret i32 %I22
}
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 4.0.0"}
diff --git a/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll b/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
index 9708be9..5e844b4 100644
--- a/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
+++ b/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-macosx10.10.0"
; CHECK: uselistorder label %bb16, { 1, 0 }
; Function Attrs: noinline nounwind ssp uwtable
-define void @ham(i1 %arg) local_unnamed_addr #0 {
+define void @ham(i1 %arg) local_unnamed_addr {
bb:
br i1 false, label %bb1, label %bb22
@@ -64,8 +64,6 @@ bb22: ; preds = %bb21, %bb
ret void
}
-attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!0 = !{i32 7, !"PIC Level", i32 2}
diff --git a/llvm/test/Transforms/AddDiscriminators/basic.ll b/llvm/test/Transforms/AddDiscriminators/basic.ll
index 5186537..fc4c10a 100644
--- a/llvm/test/Transforms/AddDiscriminators/basic.ll
+++ b/llvm/test/Transforms/AddDiscriminators/basic.ll
@@ -11,7 +11,7 @@
; if (i < 10) x = i;
; }
-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
entry:
%i.addr = alloca i32, align 4
%x = alloca i32, align 4
@@ -35,8 +35,6 @@ if.end: ; preds = %if.then, %entry
; CHECK: ret void, !dbg ![[END:[0-9]+]]
}
-attributes #0 = { nounwind uwtable noinline optnone "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/call-nested.ll b/llvm/test/Transforms/AddDiscriminators/call-nested.ll
index 99340a5..f1373e4 100644
--- a/llvm/test/Transforms/AddDiscriminators/call-nested.ll
+++ b/llvm/test/Transforms/AddDiscriminators/call-nested.ll
@@ -9,7 +9,7 @@
; #6 }
; Function Attrs: uwtable
-define i32 @_Z3bazv() #0 !dbg !4 {
+define i32 @_Z3bazv() !dbg !4 {
%1 = call i32 @_Z3barv(), !dbg !11
; CHECK: %1 = call i32 @_Z3barv(), !dbg ![[CALL0:[0-9]+]]
%2 = call i32 @_Z3barv(), !dbg !12
@@ -19,12 +19,9 @@ define i32 @_Z3bazv() #0 !dbg !4 {
ret i32 %3, !dbg !14
}
-declare i32 @_Z3fooii(i32, i32) #1
+declare i32 @_Z3fooii(i32, i32)
-declare i32 @_Z3barv() #1
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare i32 @_Z3barv()
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}
diff --git a/llvm/test/Transforms/AddDiscriminators/call.ll b/llvm/test/Transforms/AddDiscriminators/call.ll
index 93d3aa4..11b21ef 100644
--- a/llvm/test/Transforms/AddDiscriminators/call.ll
+++ b/llvm/test/Transforms/AddDiscriminators/call.ll
@@ -8,7 +8,7 @@
; #5 }
; Function Attrs: uwtable
-define void @_Z3foov() #0 !dbg !4 {
+define void @_Z3foov() !dbg !4 {
call void @_Z3barv(), !dbg !10
; CHECK: call void @_Z3barv(), !dbg ![[CALL0:[0-9]+]]
%a = alloca [100 x i8], align 16
@@ -21,13 +21,10 @@ define void @_Z3foov() #0 !dbg !4 {
ret void, !dbg !13
}
-declare void @_Z3barv() #1
+declare void @_Z3barv()
declare void @llvm.lifetime.start.p0(ptr nocapture) nounwind argmemonly
declare void @llvm.lifetime.end.p0(ptr nocapture) nounwind argmemonly
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/diamond.ll b/llvm/test/Transforms/AddDiscriminators/diamond.ll
index c93a57a..9edcf39 100644
--- a/llvm/test/Transforms/AddDiscriminators/diamond.ll
+++ b/llvm/test/Transforms/AddDiscriminators/diamond.ll
@@ -12,7 +12,7 @@
; bar(3): discriminator 2
; Function Attrs: uwtable
-define void @_Z3fooi(i32 %i) #0 !dbg !4 {
+define void @_Z3fooi(i32 %i) !dbg !4 {
%1 = alloca i32, align 4
store i32 %i, ptr %1, align 4
call void @llvm.dbg.declare(metadata ptr %1, metadata !11, metadata !12), !dbg !13
@@ -34,13 +34,9 @@ define void @_Z3fooi(i32 %i) #0 !dbg !4 {
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @_Z3bari(i32) #2
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @_Z3bari(i32)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}
diff --git a/llvm/test/Transforms/AddDiscriminators/first-only.ll b/llvm/test/Transforms/AddDiscriminators/first-only.ll
index 7ae9ed0..415e5f0 100644
--- a/llvm/test/Transforms/AddDiscriminators/first-only.ll
+++ b/llvm/test/Transforms/AddDiscriminators/first-only.ll
@@ -13,7 +13,7 @@
; }
; }
-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
entry:
%i.addr = alloca i32, align 4
%x = alloca i32, align 4
@@ -44,8 +44,6 @@ if.end: ; preds = %if.then, %entry
; CHECK: ret void, !dbg ![[END:[0-9]+]]
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/invoke.ll b/llvm/test/Transforms/AddDiscriminators/invoke.ll
index d39014d..a3989b6 100644
--- a/llvm/test/Transforms/AddDiscriminators/invoke.ll
+++ b/llvm/test/Transforms/AddDiscriminators/invoke.ll
@@ -5,14 +5,14 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.14.0"
; Function Attrs: ssp uwtable
-define void @_Z3foov() #0 personality ptr @__gxx_personality_v0 !dbg !8 {
+define void @_Z3foov() personality ptr @__gxx_personality_v0 !dbg !8 {
entry:
%exn.slot = alloca ptr
%ehselector.slot = alloca i32
; CHECK: call void @_Z12bar_noexceptv({{.*}} !dbg ![[CALL1:[0-9]+]]
- call void @_Z12bar_noexceptv() #4, !dbg !11
+ call void @_Z12bar_noexceptv(), !dbg !11
; CHECK: call void @_Z12bar_noexceptv({{.*}} !dbg ![[CALL2:[0-9]+]]
- call void @_Z12bar_noexceptv() #4, !dbg !13
+ call void @_Z12bar_noexceptv(), !dbg !13
invoke void @_Z3barv()
; CHECK: unwind label {{.*}} !dbg ![[INVOKE:[0-9]+]]
to label %invoke.cont unwind label %lpad, !dbg !14
@@ -31,8 +31,8 @@ lpad: ; preds = %entry
catch: ; preds = %lpad
%exn = load ptr, ptr %exn.slot, align 8, !dbg !15
- %3 = call ptr @__cxa_begin_catch(ptr %exn) #4, !dbg !15
- invoke void @__cxa_rethrow() #5
+ %3 = call ptr @__cxa_begin_catch(ptr %exn), !dbg !15
+ invoke void @__cxa_rethrow()
to label %unreachable unwind label %lpad1, !dbg !17
lpad1: ; preds = %catch
@@ -62,7 +62,7 @@ terminate.lpad: ; preds = %lpad1
%7 = landingpad { ptr, i32 }
catch ptr null, !dbg !20
%8 = extractvalue { ptr, i32 } %7, 0, !dbg !20
- call void @__clang_call_terminate(ptr %8) #6, !dbg !20
+ call void @__clang_call_terminate(ptr %8), !dbg !20
unreachable, !dbg !20
unreachable: ; preds = %catch
@@ -70,9 +70,9 @@ unreachable: ; preds = %catch
}
; Function Attrs: nounwind
-declare void @_Z12bar_noexceptv() #1
+declare void @_Z12bar_noexceptv()
-declare void @_Z3barv() #2
+declare void @_Z3barv()
declare i32 @__gxx_personality_v0(...)
@@ -83,22 +83,14 @@ declare void @__cxa_rethrow()
declare void @__cxa_end_catch()
; Function Attrs: noinline noreturn nounwind
-define linkonce_odr hidden void @__clang_call_terminate(ptr) #3 {
- %2 = call ptr @__cxa_begin_catch(ptr %0) #4
- call void @_ZSt9terminatev() #6
+define linkonce_odr hidden void @__clang_call_terminate(ptr) {
+ %2 = call ptr @__cxa_begin_catch(ptr %0)
+ call void @_ZSt9terminatev()
unreachable
}
declare void @_ZSt9terminatev()
-attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline noreturn nounwind }
-attributes #4 = { nounwind }
-attributes #5 = { noreturn }
-attributes #6 = { noreturn nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6}
!llvm.ident = !{!7}
diff --git a/llvm/test/Transforms/AddDiscriminators/multiple.ll b/llvm/test/Transforms/AddDiscriminators/multiple.ll
index 54c1a5d..8e8ca6a 100644
--- a/llvm/test/Transforms/AddDiscriminators/multiple.ll
+++ b/llvm/test/Transforms/AddDiscriminators/multiple.ll
@@ -10,7 +10,7 @@
; The two stores inside the if-then-else line must have different discriminator
; values.
-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
entry:
%i.addr = alloca i32, align 4
%x = alloca i32, align 4
@@ -45,8 +45,6 @@ if.end: ; preds = %if.else, %if.then
ret void, !dbg !12
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll b/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
index c23edd6..f84579b 100644
--- a/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
+++ b/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
@@ -12,7 +12,7 @@
; altered. If they are, it means that the discriminators pass added a
; new lexical scope.
-define i32 @foo(i64 %i) #0 !dbg !4 {
+define i32 @foo(i64 %i) !dbg !4 {
entry:
%retval = alloca i32, align 4
%i.addr = alloca i64, align 8
@@ -39,10 +39,7 @@ return: ; preds = %if.else, %if.then
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; We should be able to add discriminators even in the absence of llvm.dbg.cu.
; When using sample profiles, the front end will generate line tables but it
diff --git a/llvm/test/Transforms/AddDiscriminators/oneline.ll b/llvm/test/Transforms/AddDiscriminators/oneline.ll
index 533d547..fc1675b 100644
--- a/llvm/test/Transforms/AddDiscriminators/oneline.ll
+++ b/llvm/test/Transforms/AddDiscriminators/oneline.ll
@@ -10,7 +10,7 @@
; return 100: discriminator 4
; return 99: discriminator 6
-define i32 @_Z3fooi(i32 %i) #0 !dbg !4 {
+define i32 @_Z3fooi(i32 %i) !dbg !4 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
store i32 %i, ptr %2, align 4, !tbaa !13
@@ -49,10 +49,7 @@ define i32 @_Z3fooi(i32 %i) #0 !dbg !4 {
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11}
diff --git a/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll b/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
index eb7d78f..4704238 100644
--- a/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
+++ b/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
@@ -1557,24 +1557,24 @@ declare dso_local void @_GLOBAL__sub_I_register_benchmark_test.cc() #0 section "
; Function Attrs: cold noreturn nounwind
declare void @llvm.trap() #20
-attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #3 = { nounwind }
-attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #5 = { argmemonly nounwind willreturn }
-attributes #6 = { alwaysinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #8 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #10 = { inlinehint uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #11 = { inlinehint nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #12 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #13 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { alwaysinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #7 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #8 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #10 = { inlinehint uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #11 = { inlinehint nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #12 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #13 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #14 = { nounwind readnone willreturn }
-attributes #15 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #15 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #16 = { noinline noreturn nounwind }
attributes #17 = { argmemonly nounwind willreturn writeonly }
-attributes #18 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #19 = { inlinehint noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #18 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #19 = { inlinehint noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #20 = { cold noreturn nounwind }
diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
index d272fef..189186b 100644
--- a/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
@@ -4,7 +4,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv7--linux-gnueabihf"
; CHECK-LABEL: @f
-define i32 @f(i32 %a) #0 {
+define i32 @f(i32 %a) {
; CHECK: call i32 @llvm.bitreverse.i32
entry:
br label %for.body
@@ -25,8 +25,6 @@ for.body: ; preds = %for.body, %entry
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3
}
-attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll b/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
index eec0967..35115cf 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
@@ -21,7 +21,7 @@
@b = common global i32 0, align 4
; CHECK: define i32 @fn1
-define i32 @fn1() #0 {
+define i32 @fn1() {
entry:
%b.promoted = load i32, ptr @b, align 4, !tbaa !2
br label %for.body
@@ -40,8 +40,6 @@ for.end: ; preds = %for.body
ret i32 undef
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll b/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
index 1c990ff..14360fe 100644
--- a/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
@@ -10,7 +10,7 @@
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv7--linux-gnueabihf"
-define i32 @f(i32 %a) #0 {
+define i32 @f(i32 %a) {
entry:
br label %for.body
@@ -30,8 +30,6 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3
}
-attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll b/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
index 46fa066..78760db 100644
--- a/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
+++ b/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
@@ -20,7 +20,7 @@ target triple = "x86_64-pc-windows-msvc"
; BFIHOIST: br label %endif
; Function Attrs: norecurse
-define i32 @main(i32 %argc, ptr nocapture readnone %argv) local_unnamed_addr #0 personality ptr @__CxxFrameHandler3 {
+define i32 @main(i32 %argc, ptr nocapture readnone %argv) local_unnamed_addr personality ptr @__CxxFrameHandler3 {
%call = tail call i64 @fn(i64 0)
%call1 = tail call i64 @fn(i64 1)
%tobool = icmp eq i32 %argc, 0
@@ -62,9 +62,6 @@ endif:
ret i32 0
}
-declare i64 @fn(i64) local_unnamed_addr #1
+declare i64 @fn(i64) local_unnamed_addr
declare i32 @__CxxFrameHandler3(...)
-
-attributes #0 = { norecurse "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/ConstraintElimination/add-nsw.ll b/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
index 5127e92..4b8ac09 100644
--- a/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
+++ b/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
@@ -757,8 +757,7 @@ define i1 @add_neg_1_known_sge_ult_1(i32 %a) {
; CHECK-NEXT: [[A_SGE:%.*]] = icmp sge i32 [[A:%.*]], 1
; CHECK-NEXT: call void @llvm.assume(i1 [[A_SGE]])
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[A]], -1
-; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[SUB]], [[A]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
;
entry:
%a.sge = icmp sge i32 %a, 1
@@ -823,8 +822,7 @@ define i1 @add_neg_3_known_sge_ult_1(i32 %a) {
; CHECK-NEXT: [[A_SGE:%.*]] = icmp sge i32 [[A:%.*]], 3
; CHECK-NEXT: call void @llvm.assume(i1 [[A_SGE]])
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[A]], -3
-; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[SUB]], [[A]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
;
entry:
%a.sge = icmp sge i32 %a, 3
diff --git a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
index 52adc78..8dcac78 100644
--- a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
+++ b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
@@ -389,8 +389,7 @@ define i1 @gep_count_add_1_sge_known_ult_1(i32 %count, ptr %p) {
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[COUNT]], -1
; CHECK-NEXT: [[SUB_EXT:%.*]] = zext i32 [[SUB]] to i64
; CHECK-NEXT: [[GEP_SUB:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[SUB_EXT]]
-; CHECK-NEXT: [[C:%.*]] = icmp ult ptr [[GEP_SUB]], [[GEP_COUNT]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
;
entry:
%sge = icmp sge i32 %count, 1
@@ -415,8 +414,7 @@ define i1 @gep_count_add_1_sge_known_uge_1(i32 %count, ptr %p) {
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[COUNT]], -1
; CHECK-NEXT: [[SUB_EXT:%.*]] = zext i32 [[SUB]] to i64
; CHECK-NEXT: [[GEP_SUB:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[SUB_EXT]]
-; CHECK-NEXT: [[C:%.*]] = icmp uge ptr [[GEP_SUB]], [[P]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
;
entry:
%sge = icmp sge i32 %count, 1
diff --git a/llvm/test/Transforms/Coroutines/coro-debug.ll b/llvm/test/Transforms/Coroutines/coro-debug.ll
index d1f1922..109be51f 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug.ll
@@ -14,7 +14,7 @@ entry:
%0 = call token @llvm.coro.id(i32 0, ptr null, ptr @flink, ptr null), !dbg !16
%1 = call i64 @llvm.coro.size.i64(), !dbg !16
%call = call ptr @malloc(i64 %1), !dbg !16
- %2 = call ptr @llvm.coro.begin(token %0, ptr %call) #7, !dbg !16
+ %2 = call ptr @llvm.coro.begin(token %0, ptr %call), !dbg !16
store ptr %2, ptr %coro_hdl, align 8, !dbg !16
%3 = call i8 @llvm.coro.suspend(token none, i1 false), !dbg !17
%conv = sext i8 %3 to i32, !dbg !17
@@ -69,7 +69,7 @@ coro_Cleanup: ; preds = %sw.epilog, %sw.bb1
br label %coro_Suspend, !dbg !24
coro_Suspend: ; preds = %coro_Cleanup, %sw.default
- call void @llvm.coro.end(ptr null, i1 false, token none) #7, !dbg !24
+ call void @llvm.coro.end(ptr null, i1 false, token none), !dbg !24
%7 = load ptr, ptr %coro_hdl, align 8, !dbg !24
store i32 0, ptr %late_local, !dbg !24
ret ptr %7, !dbg !24
@@ -82,47 +82,40 @@ ehcleanup:
}
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+declare void @llvm.dbg.value(metadata, metadata, metadata)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind readonly
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #2
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
-declare ptr @malloc(i64) #3
+declare ptr @malloc(i64)
declare ptr @allocate()
declare void @print({ ptr, i32 })
declare void @log()
; Function Attrs: nounwind readnone
-declare i64 @llvm.coro.size.i64() #4
+declare i64 @llvm.coro.size.i64()
; Function Attrs: nounwind
-declare ptr @llvm.coro.begin(token, ptr writeonly) #5
+declare ptr @llvm.coro.begin(token, ptr writeonly)
; Function Attrs: nounwind
-declare i8 @llvm.coro.suspend(token, i1) #5
+declare i8 @llvm.coro.suspend(token, i1)
-declare void @free(ptr) #3
+declare void @free(ptr)
; Function Attrs: argmemonly nounwind readonly
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2
+declare ptr @llvm.coro.free(token, ptr nocapture readonly)
; Function Attrs: nounwind
-declare void @llvm.coro.end(ptr, i1, token) #5
+declare void @llvm.coro.end(ptr, i1, token)
; Function Attrs: argmemonly nounwind readonly
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #2
-
-attributes #0 = { noinline nounwind presplitcoroutine "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind readonly }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone }
-attributes #5 = { nounwind }
-attributes #6 = { alwaysinline }
-attributes #7 = { noduplicate }
+declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8)
+
+attributes #0 = { noinline nounwind presplitcoroutine }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/Coroutines/coro-split-dbg.ll b/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
index c53bea8..577ca9a 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
@@ -6,9 +6,9 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @bar(...) local_unnamed_addr #2
+declare void @bar(...) local_unnamed_addr
; Function Attrs: nounwind uwtable
define ptr @f() #3 !dbg !16 {
@@ -16,14 +16,14 @@ entry:
%0 = tail call token @llvm.coro.id(i32 0, ptr null, ptr @f, ptr null), !dbg !26
%1 = tail call i64 @llvm.coro.size.i64(), !dbg !26
%call = tail call ptr @malloc(i64 %1), !dbg !26
- %2 = tail call ptr @llvm.coro.begin(token %0, ptr %call) #9, !dbg !26
+ %2 = tail call ptr @llvm.coro.begin(token %0, ptr %call), !dbg !26
tail call void @llvm.dbg.value(metadata ptr %2, metadata !21, metadata !12), !dbg !26
br label %for.cond, !dbg !27
for.cond: ; preds = %for.cond, %entry
tail call void @llvm.dbg.value(metadata i32 undef, metadata !22, metadata !12), !dbg !28
- tail call void @llvm.dbg.value(metadata i32 undef, metadata !11, metadata !12) #7, !dbg !29
- tail call void (...) @bar() #7, !dbg !33
+ tail call void @llvm.dbg.value(metadata i32 undef, metadata !11, metadata !12), !dbg !29
+ tail call void (...) @bar(), !dbg !33
%3 = tail call token @llvm.coro.save(ptr null), !dbg !34
%4 = tail call i8 @llvm.coro.suspend(token %3, i1 false), !dbg !34
%conv = sext i8 %4 to i32, !dbg !34
@@ -38,40 +38,31 @@ coro_Cleanup: ; preds = %for.cond
br label %coro_Suspend, !dbg !36
coro_Suspend: ; preds = %for.cond, %if.then, %coro_Cleanup
- tail call void @llvm.coro.end(ptr null, i1 false, token none) #9, !dbg !38
+ tail call void @llvm.coro.end(ptr null, i1 false, token none), !dbg !38
ret ptr %2, !dbg !39
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #4
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: argmemonly nounwind readonly
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #5
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
; Function Attrs: nounwind
-declare noalias ptr @malloc(i64) local_unnamed_addr #6
-declare i64 @llvm.coro.size.i64() #1
-declare ptr @llvm.coro.begin(token, ptr writeonly) #7
-declare token @llvm.coro.save(ptr) #7
-declare i8 @llvm.coro.suspend(token, i1) #7
-declare void @llvm.lifetime.end.p0(ptr nocapture) #4
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #5
-declare void @free(ptr nocapture) local_unnamed_addr #6
-declare void @llvm.coro.end(ptr, i1, token) #7
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #5
+declare noalias ptr @malloc(i64) local_unnamed_addr
+declare i64 @llvm.coro.size.i64()
+declare ptr @llvm.coro.begin(token, ptr writeonly)
+declare token @llvm.coro.save(ptr)
+declare i8 @llvm.coro.suspend(token, i1)
+declare void @llvm.lifetime.end.p0(ptr nocapture)
+declare ptr @llvm.coro.free(token, ptr nocapture readonly)
+declare void @free(ptr nocapture) local_unnamed_addr
+declare void @llvm.coro.end(ptr, i1, token)
+declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8)
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+declare void @llvm.dbg.value(metadata, metadata, metadata)
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind uwtable presplitcoroutine "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { argmemonly nounwind }
-attributes #5 = { argmemonly nounwind readonly }
-attributes #6 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { nounwind }
-attributes #8 = { alwaysinline nounwind }
-attributes #9 = { noduplicate }
+attributes #3 = { nounwind uwtable presplitcoroutine }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/DeadArgElim/dbginfo.ll b/llvm/test/Transforms/DeadArgElim/dbginfo.ll
index a27ca9d..f6313c7 100644
--- a/llvm/test/Transforms/DeadArgElim/dbginfo.ll
+++ b/llvm/test/Transforms/DeadArgElim/dbginfo.ll
@@ -21,14 +21,14 @@
; updated LLVM functions.
; Function Attrs: uwtable
-define void @_Z2f2v() #0 !dbg !4 {
+define void @_Z2f2v() !dbg !4 {
entry:
call void (i32, ...) @_ZL2f1iz(i32 1), !dbg !15
ret void, !dbg !16
}
; Function Attrs: nounwind uwtable
-define internal void @_ZL2f1iz(i32, ...) #1 !dbg !8 {
+define internal void @_ZL2f1iz(i32, ...) !dbg !8 {
entry:
call void @llvm.dbg.value(metadata i32 %0, metadata !17, metadata !18), !dbg !19
ret void, !dbg !20
@@ -40,8 +40,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
index ae3c746..2eaa275 100644
--- a/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
@@ -5,8 +5,8 @@ define void @dead_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias
; CHECK-LABEL: define void @dead_unstrided_store_non_matrix_load(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: ret void
;
entry:
@@ -173,7 +173,6 @@ define void @dead_unstrided_store(ptr noalias %src, ptr noalias %dst) {
; CHECK-LABEL: define void @dead_unstrided_store(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: ret void
@@ -241,7 +240,6 @@ define void @dead_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src,
; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_unstrided(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64
; CHECK-NEXT: ret void
@@ -257,7 +255,6 @@ define void @dead_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, pt
; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_strided(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: store <16 x double> zeroinitializer, ptr [[DST]], align 128
; CHECK-NEXT: ret void
@@ -289,7 +286,6 @@ define void @live_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, pt
; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_strided(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64
; CHECK-NEXT: ret void
@@ -305,8 +301,6 @@ define void @dead_matrix_store_dimension_change(ptr noalias %src, ptr noalias %d
; CHECK-LABEL: define void @dead_matrix_store_dimension_change(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3)
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll b/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
index f66aa0f..2cec6b5 100644
--- a/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu"
@g = common global [1 x i8] zeroinitializer, align 1, !dbg !0
; Function Attrs: noinline nounwind uwtable
-define void @foo() #0 !dbg !14 {
+define void @foo() !dbg !14 {
entry:
%i = alloca i8, align 1
store i8 1, ptr %i, align 1, !dbg !19
@@ -37,7 +37,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
; Function Attrs: argmemonly nounwind
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #2
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { argmemonly nounwind }
diff --git a/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
index 4d3a241..628669d 100644
--- a/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
+++ b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
@@ -3,13 +3,11 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
-define void @func() #0 !dbg !4 {
+define void @func() !dbg !4 {
entry:
ret void, !dbg !10
}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/FunctionImport/funcimport_debug.ll b/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
index f449047..713dc4e 100644
--- a/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
+++ b/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
@@ -24,16 +24,13 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
-define i32 @main() #0 !dbg !4 {
+define i32 @main() !dbg !4 {
entry:
call void (...) @func(), !dbg !11
ret i32 0, !dbg !12
}
-declare void @func(...) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @func(...)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}
diff --git a/llvm/test/Transforms/GCOVProfiling/exit-block.ll b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
index 1840f04..543edac 100644
--- a/llvm/test/Transforms/GCOVProfiling/exit-block.ll
+++ b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
@A = common global i32 0, align 4, !dbg !9
; Function Attrs: nounwind uwtable
-define void @test() #0 !dbg !4 {
+define void @test() !dbg !4 {
entry:
tail call void (...) @f() #2, !dbg !14
%0 = load i32, ptr @A, align 4, !dbg !15
@@ -28,12 +28,10 @@ if.end: ; preds = %entry, %if.then
ret void, !dbg !18
}
-declare void @f(...) #1
+declare void @f(...)
-declare void @g(...) #1
+declare void @g(...)
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.gcov = !{!19}
diff --git a/llvm/test/Transforms/GCOVProfiling/linezero.ll b/llvm/test/Transforms/GCOVProfiling/linezero.ll
index 8bfeabd..1eae413 100644
--- a/llvm/test/Transforms/GCOVProfiling/linezero.ll
+++ b/llvm/test/Transforms/GCOVProfiling/linezero.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-unknown-linux-gnu"
%struct.vector = type { i8 }
; Function Attrs: nounwind
-define i32 @_Z4testv() #0 !dbg !15 {
+define i32 @_Z4testv() !dbg !15 {
entry:
%retval = alloca i32, align 4
%__range = alloca ptr, align 8
@@ -63,19 +63,19 @@ return: ; No predecessors!
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @_Z13TagFieldSpecsv() #2
+declare void @_Z13TagFieldSpecsv()
-declare ptr @_ZN6vector5beginEv(ptr) #2
+declare ptr @_ZN6vector5beginEv(ptr)
-declare ptr @_ZN6vector3endEv(ptr) #2
+declare ptr @_ZN6vector3endEv(ptr)
; Function Attrs: noreturn nounwind
-declare void @llvm.trap() #3
+declare void @llvm.trap()
; Function Attrs: nounwind
-define void @_Z2f1v() #0 !dbg !20 {
+define void @_Z2f1v() !dbg !20 {
entry:
br label %0
@@ -83,11 +83,6 @@ entry:
ret void, !dbg !45
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noreturn nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!23, !24}
!llvm.gcov = !{!25}
diff --git a/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll b/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
index 7f169e2..98bdd74 100644
--- a/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
+++ b/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
@@ -10,7 +10,7 @@
; CHECK-NEXT: load {{.*}} @__llvm_gcov_ctr
; CHECK-NOT: load {{.*}} @__llvm_gcov_ctr
-define dso_local i32 @cannot_split(ptr nocapture readonly %p) #0 !dbg !7 {
+define dso_local i32 @cannot_split(ptr nocapture readonly %p) !dbg !7 {
entry:
%targets = alloca <2 x ptr>, align 16
store <2 x ptr> <ptr blockaddress(@cannot_split, %indirect), ptr blockaddress(@cannot_split, %end)>, ptr %targets, align 16, !dbg !9
@@ -42,8 +42,6 @@ end: ; preds = %indirect
ret i32 0, !dbg !22
}
-attributes #0 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/GVN/cond_br2.ll b/llvm/test/Transforms/GVN/cond_br2.ll
index 6ceec95..4811ad0 100644
--- a/llvm/test/Transforms/GVN/cond_br2.ll
+++ b/llvm/test/Transforms/GVN/cond_br2.ll
@@ -11,12 +11,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
%"union.llvm::SmallVectorBase::U" = type { x86_fp80 }
; Function Attrs: ssp uwtable
-define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
+define void @_Z4testv() personality ptr @__gxx_personality_v0 {
; CHECK-LABEL: define void @_Z4testv(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 {
+; CHECK-SAME: ) personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SV:%.*]] = alloca %"class.llvm::SmallVector", align 16
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SV]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SV]])
; CHECK-NEXT: [[FIRSTEL_I_I_I_I_I_I:%.*]] = getelementptr inbounds %"class.llvm::SmallVector", ptr [[SV]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
; CHECK-NEXT: store ptr [[FIRSTEL_I_I_I_I_I_I]], ptr [[SV]], align 16, !tbaa [[ANYPTR_TBAA0:![0-9]+]]
; CHECK-NEXT: [[ENDX_I_I_I_I_I_I:%.*]] = getelementptr inbounds %"class.llvm::SmallVector", ptr [[SV]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -66,10 +66,10 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: [[CMP_I_I_I_I19:%.*]] = icmp eq ptr [[TMP0]], [[FIRSTEL_I_I_I_I_I_I]]
; CHECK-NEXT: br i1 [[CMP_I_I_I_I19]], label %[[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21:.*]], label %[[IF_THEN_I_I_I20:.*]]
; CHECK: [[IF_THEN_I_I_I20]]:
-; CHECK-NEXT: call void @free(ptr [[TMP0]]) #[[ATTR4]]
+; CHECK-NEXT: call void @free(ptr [[TMP0]])
; CHECK-NEXT: br label %[[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21]]
; CHECK: [[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21]]:
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SV]]) #[[ATTR4]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SV]])
; CHECK-NEXT: ret void
; CHECK: [[LPAD]]:
; CHECK-NEXT: [[TMP1:%.*]] = landingpad { ptr, i32 }
@@ -78,7 +78,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: [[CMP_I_I_I_I:%.*]] = icmp eq ptr [[TMP2]], [[FIRSTEL_I_I_I_I_I_I]]
; CHECK-NEXT: br i1 [[CMP_I_I_I_I]], label %[[EH_RESUME:.*]], label %[[IF_THEN_I_I_I:.*]]
; CHECK: [[IF_THEN_I_I_I]]:
-; CHECK-NEXT: call void @free(ptr [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT: call void @free(ptr [[TMP2]])
; CHECK-NEXT: br label %[[EH_RESUME]]
; CHECK: [[EH_RESUME]]:
; CHECK-NEXT: resume { ptr, i32 } [[TMP1]]
@@ -86,7 +86,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
entry:
%sv = alloca %"class.llvm::SmallVector", align 16
- call void @llvm.lifetime.start.p0(ptr %sv) #1
+ call void @llvm.lifetime.start.p0(ptr %sv)
%FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
store ptr %FirstEl.i.i.i.i.i.i, ptr %sv, align 16, !tbaa !4
%EndX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -151,11 +151,11 @@ invoke.cont3: ; preds = %invoke.cont2
br i1 %cmp.i.i.i.i19, label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21, label %if.then.i.i.i20
if.then.i.i.i20: ; preds = %invoke.cont3
- call void @free(ptr %5) #1
+ call void @free(ptr %5)
br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21
_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21: ; preds = %invoke.cont3, %if.then.i.i.i20
- call void @llvm.lifetime.end.p0(ptr %sv) #1
+ call void @llvm.lifetime.end.p0(ptr %sv)
ret void
lpad: ; preds = %if.end.i14, %if.end.i, %invoke.cont2
@@ -166,7 +166,7 @@ lpad: ; preds = %if.end.i14, %if.end
br i1 %cmp.i.i.i.i, label %eh.resume, label %if.then.i.i.i
if.then.i.i.i: ; preds = %lpad
- call void @free(ptr %7) #1
+ call void @free(ptr %7)
br label %eh.resume
eh.resume: ; preds = %if.then.i.i.i, %lpad
@@ -174,24 +174,19 @@ eh.resume: ; preds = %if.then.i.i.i, %lpa
}
; Function Attrs: nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
declare i32 @__gxx_personality_v0(...)
-declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr) #2
+declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr)
; Function Attrs: nounwind
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
-declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64) #2
+declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64)
; Function Attrs: nounwind
-declare void @free(ptr nocapture) #3
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @free(ptr nocapture)
!0 = !{!"any pointer", !1}
!1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/Transforms/GVN/matrix-intrinsics.ll b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
index 78dbfe1..03bd45b 100644
--- a/llvm/test/Transforms/GVN/matrix-intrinsics.ll
+++ b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
@@ -8,9 +8,8 @@ define void @redundant_unstrided_load(ptr %src) {
; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 8
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
-; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @use(<8 x double> [[L]])
-; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
; CHECK-NEXT: ret void
;
entry:
@@ -30,9 +29,8 @@ define void @redundant_unstrided_load_non_matrix_store(ptr %src) {
; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 1
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8
-; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @use(<8 x double> [[L]])
-; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
; CHECK-NEXT: ret void
;
entry:
@@ -52,9 +50,8 @@ define void @redundant_strided_load(ptr %src) {
; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
-; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @use(<8 x double> [[L]])
-; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
; CHECK-NEXT: ret void
;
entry:
@@ -75,9 +72,8 @@ define void @redundant_strided_load_non_matrix_store(ptr %src) {
; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8
-; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @use(<8 x double> [[L]])
-; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/GVN/pr33549.ll b/llvm/test/Transforms/GVN/pr33549.ll
index a8ce37c..a1e28f7 100644
--- a/llvm/test/Transforms/GVN/pr33549.ll
+++ b/llvm/test/Transforms/GVN/pr33549.ll
@@ -4,9 +4,9 @@
@Data = common local_unnamed_addr global [32 x i32] zeroinitializer, align 4
; Function Attrs: norecurse nounwind
-define void @testshl() local_unnamed_addr #0 {
+define void @testshl() local_unnamed_addr {
; CHECK-LABEL: define void @testshl(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
@@ -78,8 +78,6 @@ for.end10: ; preds = %for.inc8
ret void
}
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="+d16,+dsp,+fp-armv8,+hwdiv,+strict-align,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/GVN/pr42605.ll b/llvm/test/Transforms/GVN/pr42605.ll
index 3e6241c..98b447f8 100644
--- a/llvm/test/Transforms/GVN/pr42605.ll
+++ b/llvm/test/Transforms/GVN/pr42605.ll
@@ -114,7 +114,7 @@ if.end: ; preds = %if.then, %entry
ret void
}
-attributes #0 = { noinline norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline norecurse nounwind readonly uwtable }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll b/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
index b9c8537..082d016 100644
--- a/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
+++ b/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
@@ -14,7 +14,7 @@
@res = common global i32 0, align 4
; Function Attrs:
-define i64 @func() #0 {
+define i64 @func() {
entry:
ret i64 1
}
@@ -27,7 +27,7 @@ entry:
%2 = load volatile i32, ptr @g_x_u, align 4
%3 = load volatile i32, ptr @g_z_u, align 4
%4 = load volatile i32, ptr @g_m, align 4
- %call = call i64 @func() #4
+ %call = call i64 @func()
%conv = sext i32 %1 to i64
%cmp = icmp ne i64 %call, %conv
br i1 %cmp, label %if.end, label %lor.lhs.false
@@ -42,7 +42,7 @@ if.then:
br label %cleanup
if.end:
- %call4 = call i64 @func() #4
+ %call4 = call i64 @func()
%conv5 = zext i32 %3 to i64
%cmp6 = icmp ne i64 %call4, %conv5
br i1 %cmp6, label %if.end14, label %lor.lhs.false8
@@ -57,7 +57,7 @@ if.then13:
br label %cleanup
if.end14:
- %call15 = call i64 @func() #4
+ %call15 = call i64 @func()
%cmp17 = icmp ne i64 %call15, %conv
br i1 %cmp17, label %if.end25, label %lor.lhs.false19
@@ -77,5 +77,3 @@ cleanup:
%retval.0 = phi i32 [ 0, %if.end25 ], [ 1, %if.then24 ], [ 1, %if.then13 ], [ 1, %if.then ]
ret i32 %retval.0
}
-
-attributes #0 = { minsize noinline nounwind optsize uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/GVNHoist/pr30499.ll b/llvm/test/Transforms/GVNHoist/pr30499.ll
index bd6f98c..6df9484b 100644
--- a/llvm/test/Transforms/GVNHoist/pr30499.ll
+++ b/llvm/test/Transforms/GVNHoist/pr30499.ll
@@ -1,6 +1,6 @@
; RUN: opt -S -passes=gvn-hoist < %s
-define void @_Z3fn2v() #0 {
+define void @_Z3fn2v() {
entry:
%a = alloca ptr, align 8
%b = alloca i32, align 4
@@ -11,7 +11,7 @@ entry:
br i1 %tobool, label %if.then, label %if.end
if.then: ; preds = %entry
- %call = call i64 @_Z3fn1v() #2
+ %call = call i64 @_Z3fn1v()
%conv = trunc i64 %call to i32
store i32 %conv, ptr %b, align 4
br label %if.end
@@ -23,8 +23,4 @@ if.end: ; preds = %if.then, %entry
}
; Function Attrs: nounwind readonly
-declare i64 @_Z3fn1v() #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readonly }
+declare i64 @_Z3fn1v()
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll b/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
index 7cba30d..81a9323 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
@@ -4,7 +4,7 @@ target triple = "x86_64-apple-macosx"
; CHECK-LABEL: @test1
; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-define i32 @test1(ptr %a) #0 {
+define i32 @test1(ptr %a) {
entry:
br label %for.cond
@@ -25,5 +25,3 @@ for.body: ; preds = %for.cond
for.end: ; preds = %for.cond
ret i32 %sum.0
}
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/Inline/always-inline-attr.ll b/llvm/test/Transforms/Inline/always-inline-attr.ll
index 08ea307..4e69822 100644
--- a/llvm/test/Transforms/Inline/always-inline-attr.ll
+++ b/llvm/test/Transforms/Inline/always-inline-attr.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-grtev4-linux-gnu"
; After AlwaysInline the callee's attributes should be merged into the caller's attributes.
-; CHECK: define dso_local <2 x i64> @foo(ptr byval(<8 x i64>) align 64 %0) #0
+; CHECK: define dso_local <2 x i64> @foo(ptr byval(<8 x i64>) align 64 %0)
; CHECK: attributes #0 = { mustprogress uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512"
; Function Attrs: uwtable mustprogress
@@ -36,12 +36,10 @@ entry:
}
; Function Attrs: nounwind readnone
-declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) #2
-
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+aes,+avx,+avx2,+avx512bw,+avx512dq,+avx512f,+avx512vl,+bmi2,+cx16,+cx8,+f16c,+fma,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline nounwind uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+aes,+avx,+avx2,+avx512f,+cx16,+cx8,+f16c,+fma,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
+attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "prefer-vector-width"="128" }
+attributes #1 = { alwaysinline nounwind uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "prefer-vector-width"="128" }
!2 = !{!3, !3, i64 0}
!3 = !{!"omnipotent char", !4, i64 0}
diff --git a/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll b/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
index a4c4134..bf0dfa2 100644
--- a/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
+++ b/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
@@ -89,11 +89,10 @@ entry:
ret void, !dbg !20
}
-declare void @_Z2f1v() #2
+attributes #0 = { uwtable }
+attributes #1 = { alwaysinline inlinehint uwtable }
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline inlinehint uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @_Z2f1v()
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11}
diff --git a/llvm/test/Transforms/Inline/inline-vla.ll b/llvm/test/Transforms/Inline/inline-vla.ll
index 8e4bb3d..1e3c235 100644
--- a/llvm/test/Transforms/Inline/inline-vla.ll
+++ b/llvm/test/Transforms/Inline/inline-vla.ll
@@ -9,7 +9,7 @@
@.str1 = private unnamed_addr constant [3 x i8] c"ab\00", align 1
; Function Attrs: nounwind ssp uwtable
-define i32 @main(i32 %argc, ptr nocapture readnone %argv) #0 {
+define i32 @main(i32 %argc, ptr nocapture readnone %argv) {
entry:
%data = alloca [2 x i8], align 1
call fastcc void @memcpy2(ptr %data, ptr @.str, i64 1)
@@ -18,7 +18,7 @@ entry:
}
; Function Attrs: inlinehint nounwind ssp uwtable
-define internal fastcc void @memcpy2(ptr nocapture %dst, ptr nocapture readonly %src, i64 %size) #1 {
+define internal fastcc void @memcpy2(ptr nocapture %dst, ptr nocapture readonly %src, i64 %size) {
entry:
%vla = alloca i64, i64 %size, align 16
call void @llvm.memcpy.p0.p0.i64(ptr %vla, ptr %src, i64 %size, i1 false)
@@ -27,11 +27,7 @@ entry:
}
; Function Attrs: nounwind
-declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #2
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { inlinehint nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll b/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
index 3021935..2b4aedd 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
@@ -27,20 +27,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
; Function Attrs: nounwind ssp uwtable
-define i32 @foo() #0 !dbg !7 {
+define i32 @foo() !dbg !7 {
entry:
ret i32 1, !dbg !9
}
; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 {
+define i32 @bar() !dbg !10 {
entry:
%call = call i32 @foo(), !dbg !11
ret i32 %call, !dbg !12
}
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll b/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
index d394713..f547c37 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
@@ -63,20 +63,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
; Function Attrs: nounwind ssp uwtable
-define internal i32 @foo() #0 !dbg !7 {
+define internal i32 @foo() !dbg !7 {
entry:
ret i32 1, !dbg !9
}
; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 !prof !13 {
+define i32 @bar() !dbg !10 !prof !13 {
entry:
%call = call i32 @foo(), !dbg !11
ret i32 %call, !dbg !12
}
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
index b0a238f..64b305b 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
@@ -72,20 +72,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
; Function Attrs: nounwind ssp uwtable
-define i32 @foo() #0 !dbg !7 {
+define i32 @foo() !dbg !7 {
entry:
ret i32 1, !dbg !9
}
; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 !prof !13 {
+define i32 @bar() !dbg !10 !prof !13 {
entry:
%call = call i32 @foo(), !dbg !11
ret i32 %call, !dbg !12
}
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks.ll b/llvm/test/Transforms/Inline/optimization-remarks.ll
index bc1e690..135d835 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks.ll
@@ -81,9 +81,9 @@ entry:
ret i32 %add
}
-attributes #0 = { alwaysinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { alwaysinline nounwind uwtable }
+attributes #1 = { noinline nounwind uwtable }
+attributes #2 = { nounwind uwtable }
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
index ecedbdb..abe1ed0 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
@@ -124,6 +124,39 @@ define <vscale x 8 x i1> @try_combine_svbool_binop_orr(<vscale x 8 x i1> %a, <vs
ret <vscale x 8 x i1> %t3
}
+; Verify predicate cast does not hinder "isAllActive" knowledge.
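+; With an all-true svbool narrowed via convert.from.svbool, the predicate
+; should still be recognized as all active, so the predicated sve.fadd can
+; fold to a plain fadd (and fmul/fsub likewise in the tests below).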
+define <vscale x 8 x half> @try_combine_svbool_binop_fadd(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fadd(
+; CHECK-NEXT: [[T2:%.*]] = fadd <vscale x 8 x half> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 8 x half> [[T2]]
+;
+ %t1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> splat (i1 true))
+ %t2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %t1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %t2
+}
+
+; Verify predicate cast does not hinder "isAllActive" knowledge.
+define <vscale x 4 x float> @try_combine_svbool_binop_fmul(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fmul(
+; CHECK-NEXT: [[T2:%.*]] = fmul <vscale x 4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[T2]]
+;
+ %t1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> splat (i1 true))
+ %t2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %t1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %t2
+}
+
+; Verify predicate cast does not hinder "isAllActive" knowledge.
+define <vscale x 2 x double> @try_combine_svbool_binop_fsub(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fsub(
+; CHECK-NEXT: [[T2:%.*]] = fsub <vscale x 2 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 2 x double> [[T2]]
+;
+ %t1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> splat (i1 true))
+ %t2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1> %t1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %t2
+}
+
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
diff --git a/llvm/test/Transforms/InstCombine/bitreverse-hang.ll b/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
index bb01299..3f29509 100644
--- a/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
+++ b/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
@@ -21,7 +21,7 @@
@b = common global i32 0, align 4
; CHECK: define i32 @fn1
-define i32 @fn1() #0 {
+define i32 @fn1() {
entry:
%b.promoted = load i32, ptr @b, align 4, !tbaa !2
br label %for.body
@@ -40,8 +40,6 @@ for.end: ; preds = %for.body
ret i32 undef
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/InstCombine/constant-vector-insert.ll b/llvm/test/Transforms/InstCombine/constant-vector-insert.ll
new file mode 100644
index 0000000..2688540
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/constant-vector-insert.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine %s | FileCheck %s
+; RUN: opt -S -passes=instcombine %s \
+; RUN: -use-constant-int-for-fixed-length-splat \
+; RUN: -use-constant-fp-for-fixed-length-splat \
+; RUN: -use-constant-int-for-scalable-splat \
+; RUN: -use-constant-fp-for-scalable-splat | FileCheck %s
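+; These tests check that a binary op between a splat constant and a constant
+; @llvm.vector.insert folds by applying the op to both the destination-vector
+; and subvector operands of the insert.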
+
+define <vscale x 4 x i32> @insert_div() {
+; CHECK-LABEL: @insert_div(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 3), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 9), i64 0)
+ %div = udiv <vscale x 4 x i32> %0, splat (i32 3)
+ ret <vscale x 4 x i32> %div
+}
+
+define <vscale x 4 x i32> @insert_div_splat_lhs() {
+; CHECK-LABEL: @insert_div_splat_lhs(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 5), <4 x i32> splat (i32 2), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat(i32 2), <4 x i32> splat (i32 5), i64 0)
+ %div = udiv <vscale x 4 x i32> splat (i32 10), %0
+ ret <vscale x 4 x i32> %div
+}
+
+define <vscale x 4 x i32> @insert_div_mixed_splat() {
+; CHECK-LABEL: @insert_div_mixed_splat(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 6), <4 x i32> splat (i32 3), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 18), <4 x i32> splat (i32 9), i64 0)
+ %div = udiv <vscale x 4 x i32> %0, splat (i32 3)
+ ret <vscale x 4 x i32> %div
+}
+
+define <vscale x 4 x i32> @insert_mul() {
+; CHECK-LABEL: @insert_mul(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 7), i64 4)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[MUL]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 1), i64 4)
+ %mul = mul <vscale x 4 x i32> %0, splat (i32 7)
+ ret <vscale x 4 x i32> %mul
+}
+
+define <vscale x 4 x i32> @insert_add() {
+; CHECK-LABEL: @insert_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 16), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 5), i64 0)
+ %add = add <vscale x 4 x i32> %0, splat (i32 11)
+ ret <vscale x 4 x i32> %add
+}
+
+define <vscale x 4 x i32> @insert_add_non_splat_subvector() {
+; CHECK-LABEL: @insert_add_non_splat_subvector(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> <i32 101, i32 102, i32 103, i32 104>, i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i64 0)
+ %add = add <vscale x 4 x i32> %0, splat (i32 100)
+ ret <vscale x 4 x i32> %add
+}
+
+define <vscale x 4 x float> @insert_add_fp() {
+; CHECK-LABEL: @insert_add_fp(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> splat (float 6.250000e+00), <4 x float> splat (float 5.500000e+00), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x float> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> splat(float 1.25), <4 x float> splat (float 0.5), i64 0)
+ %add = fadd <vscale x 4 x float> %0, splat (float 5.0)
+ ret <vscale x 4 x float> %add
+}
+
+define <vscale x 8 x i32> @insert_add_scalable_subvector() {
+; CHECK-LABEL: @insert_add_scalable_subvector(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> splat (i32 20), <vscale x 4 x i32> splat (i32 -4), i64 0)
+; CHECK-NEXT: ret <vscale x 8 x i32> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> splat(i32 16), <vscale x 4 x i32> splat (i32 -8), i64 0)
+ %add = add <vscale x 8 x i32> %0, splat (i32 4)
+ ret <vscale x 8 x i32> %add
+}
+
+define <vscale x 4 x i32> @insert_sub() {
+; CHECK-LABEL: @insert_sub(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SUB:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> zeroinitializer, i64 8)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[SUB]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 11), i64 8)
+ %sub = add <vscale x 4 x i32> %0, splat (i32 -11)
+ ret <vscale x 4 x i32> %sub
+}
+
+define <vscale x 4 x i32> @insert_and_partially_undef() {
+; CHECK-LABEL: @insert_and_partially_undef(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> zeroinitializer, <4 x i32> splat (i32 4), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[AND]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 6), i64 0)
+ %and = and <vscale x 4 x i32> %0, splat (i32 4)
+ ret <vscale x 4 x i32> %and
+}
+
+define <vscale x 4 x i32> @insert_fold_chain() {
+; CHECK-LABEL: @insert_fold_chain(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 11), <4 x i32> splat (i32 8), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 21), <4 x i32> splat (i32 12), i64 0)
+ %div = udiv <vscale x 4 x i32> %0, splat (i32 3)
+ %add = add <vscale x 4 x i32> %div, splat (i32 4)
+ ret <vscale x 4 x i32> %add
+}
+
+; TODO: This could be folded more.
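+; (e.g. to a single vector.insert of <4 x i32> splat (i32 7) into
+; <vscale x 4 x i32> splat (i32 9)).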
+define <vscale x 4 x i32> @insert_add_both_insert_vector() {
+; CHECK-LABEL: @insert_add_both_insert_vector(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 10), <4 x i32> splat (i32 5), i64 0)
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 -1), <4 x i32> splat (i32 2), i64 0)
+; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat(i32 10), <4 x i32> splat (i32 5), i64 0)
+ %1 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat(i32 -1), <4 x i32> splat (i32 2), i64 0)
+ %add = add <vscale x 4 x i32> %0, %1
+ ret <vscale x 4 x i32> %add
+}
diff --git a/llvm/test/Transforms/InstCombine/ctlz-cttz.ll b/llvm/test/Transforms/InstCombine/ctlz-cttz.ll
new file mode 100644
index 0000000..871fb34
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/ctlz-cttz.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -S -passes=instcombine | FileCheck %s
+
+; ctlz(~i & (i - 1)) -> bitwidth - cttz(i, false)
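+; e.g. for i = 0b00010100: ~i & (i - 1) = 0b00000011, so
+; ctlz = 6 = 8 - cttz(i) = 8 - 2.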
+define i8 @ctlz_to_sub_bw_cttz(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_poison(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_poison(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 true)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_different_add(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_different_add(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], 1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, 1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_different_xor(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_different_xor(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], 1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, 1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+declare void @use(i8)
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_dec(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_dec(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: call void @use(i8 [[DEC]])
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ call void @use(i8 %dec)
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_not(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_not(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: call void @use(i8 [[NOT]])
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ call void @use(i8 %not)
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_and(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_and(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: call void @use(i8 [[AND]])
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ call void @use(i8 %and)
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_commute_and(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_commute_and(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %not, %dec
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define <2 x i8> @ctlz_to_sub_bw_cttz_vec_splat(<2 x i8> %a0) {
+; CHECK-LABEL: define <2 x i8> @ctlz_to_sub_bw_cttz_vec_splat(
+; CHECK-SAME: <2 x i8> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.cttz.v2i8(<2 x i8> [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw <2 x i8> splat (i8 8), [[TMP1]]
+; CHECK-NEXT: ret <2 x i8> [[CLZ]]
+;
+ %dec = add <2 x i8> %a0, <i8 -1, i8 -1>
+ %not = xor <2 x i8> %a0, <i8 -1, i8 -1>
+ %and = and <2 x i8> %dec, %not
+ %clz = tail call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %and, i1 false)
+ ret <2 x i8> %clz
+}
diff --git a/llvm/test/Transforms/InstCombine/intrinsic-select.ll b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
index 2f1f9fc..fc9ab9f 100644
--- a/llvm/test/Transforms/InstCombine/intrinsic-select.ll
+++ b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
@@ -222,8 +222,7 @@ declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
define i32 @vec_to_scalar_select_scalar(i1 %b) {
; CHECK-LABEL: @vec_to_scalar_select_scalar(
-; CHECK-NEXT: [[S:%.*]] = select i1 [[B:%.*]], <2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4>
-; CHECK-NEXT: [[C:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[S]])
+; CHECK-NEXT: [[C:%.*]] = select i1 [[B:%.*]], i32 3, i32 7
; CHECK-NEXT: ret i32 [[C]]
;
%s = select i1 %b, <2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4>
@@ -371,3 +370,36 @@ define float @test_fabs_select_multiuse_both_constant(i1 %cond, float %x) {
%fabs = call float @llvm.fabs.f32(float %select)
ret float %fabs
}
+
+; Negative test: Don't replace with a select between the vector mask and zeroinitializer.
+define <16 x i1> @test_select_of_active_lane_mask_bound(i64 %base, i64 %n, i1 %cond) {
+; CHECK-LABEL: @test_select_of_active_lane_mask_bound(
+; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], i64 [[N:%.*]], i64 0
+; CHECK-NEXT: [[MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[BASE:%.*]], i64 [[S]])
+; CHECK-NEXT: ret <16 x i1> [[MASK]]
+;
+ %s = select i1 %cond, i64 %n, i64 0
+ %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %base, i64 %s)
+ ret <16 x i1> %mask
+}
+
+define <16 x i1> @test_select_of_active_lane_mask_bound_both_constant(i64 %base, i64 %n, i1 %cond) {
+; CHECK-LABEL: @test_select_of_active_lane_mask_bound_both_constant(
+; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], <16 x i1> splat (i1 true), <16 x i1> zeroinitializer
+; CHECK-NEXT: ret <16 x i1> [[MASK]]
+;
+ %s = select i1 %cond, i64 16, i64 0
+ %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 %s)
+ ret <16 x i1> %mask
+}
+
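+; The select on the operand is threaded through the intrinsic:
+; uadd.with.overflow(0, 42) constant-folds to { i64 42, i1 false }, leaving a
+; select between the two aggregate results.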
+define { i64, i1 } @test_select_of_overflow_intrinsic_operand(i64 %n, i1 %cond) {
+; CHECK-LABEL: @test_select_of_overflow_intrinsic_operand(
+; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[N:%.*]], i64 42)
+; CHECK-NEXT: [[ADD_OVERFLOW:%.*]] = select i1 [[COND:%.*]], { i64, i1 } [[TMP1]], { i64, i1 } { i64 42, i1 false }
+; CHECK-NEXT: ret { i64, i1 } [[ADD_OVERFLOW]]
+;
+ %s = select i1 %cond, i64 %n, i64 0
+ %add_overflow = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %s, i64 42)
+ ret { i64, i1 } %add_overflow
+}
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll
index 3454835..1c8b21c 100644
--- a/llvm/test/Transforms/InstCombine/phi.ll
+++ b/llvm/test/Transforms/InstCombine/phi.ll
@@ -3026,6 +3026,56 @@ join:
ret i32 %umax
}
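+; A cross-lane intrinsic applied to a phi can be rewritten as a phi of the
+; per-predecessor results; the zeroinitializer arm constant-folds (to 0 for
+; the add reduction below).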
+define i32 @cross_lane_intrinsic_over_phi(i1 %c, i1 %c2, <4 x i32> %a) {
+; CHECK-LABEL: @cross_lane_intrinsic_over_phi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
+; CHECK: if:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[A:%.*]])
+; CHECK-NEXT: br label [[JOIN]]
+; CHECK: join:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP0]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: call void @may_exit()
+; CHECK-NEXT: ret i32 [[PHI]]
+;
+entry:
+ br i1 %c, label %if, label %join
+
+if:
+ br label %join
+
+join:
+ %phi = phi <4 x i32> [ %a, %if ], [ zeroinitializer, %entry ]
+ call void @may_exit()
+ %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %phi)
+ ret i32 %sum
+}
+
+define { i64, i1 } @overflow_intrinsic_over_phi(i1 %c, i64 %a) {
+; CHECK-LABEL: @overflow_intrinsic_over_phi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
+; CHECK: if:
+; CHECK-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 1)
+; CHECK-NEXT: br label [[JOIN]]
+; CHECK: join:
+; CHECK-NEXT: [[PHI:%.*]] = phi { i64, i1 } [ [[TMP0]], [[IF]] ], [ { i64 1, i1 false }, [[ENTRY:%.*]] ]
+; CHECK-NEXT: call void @may_exit()
+; CHECK-NEXT: ret { i64, i1 } [[PHI]]
+;
+entry:
+ br i1 %c, label %if, label %join
+
+if:
+ br label %join
+
+join:
+ %phi = phi i64 [ %a, %if ], [ 0, %entry ]
+ call void @may_exit()
+ %add_overflow = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %phi, i64 1)
+ ret { i64, i1 } %add_overflow
+}
+
define i32 @multiple_intrinsics_with_multiple_phi_uses(i1 %c, i32 %arg) {
; CHECK-LABEL: @multiple_intrinsics_with_multiple_phi_uses(
; CHECK-NEXT: entry:
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
index 410c43c..f19cca8 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -40,3 +40,134 @@ define i128 @ptrtoaddr_sext(ptr %p) {
%ext = sext i64 %p.addr to i128
ret i128 %ext
}
+
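+; The address difference between a gep and its base pointer folds directly
+; to the gep offset.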
+define i64 @sub_ptrtoaddr(ptr %p, i64 %offset) {
+; CHECK-LABEL: define i64 @sub_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT: ret i64 [[OFFSET]]
+;
+ %p2 = getelementptr i8, ptr %p, i64 %offset
+ %p.addr = ptrtoaddr ptr %p to i64
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ %sub = sub i64 %p2.addr, %p.addr
+ ret i64 %sub
+}
+
+define i64 @sub_ptrtoint_ptrtoaddr(ptr %p, i64 %offset) {
+; CHECK-LABEL: define i64 @sub_ptrtoint_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT: ret i64 [[OFFSET]]
+;
+ %p2 = getelementptr i8, ptr %p, i64 %offset
+ %p.int = ptrtoint ptr %p to i64
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ %sub = sub i64 %p2.addr, %p.int
+ ret i64 %sub
+}
+
+define i32 @sub_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) {
+; CHECK-LABEL: define i32 @sub_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: ret i32 [[OFFSET]]
+;
+ %p2 = getelementptr i8, ptr addrspace(1) %p, i32 %offset
+ %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %sub = sub i32 %p2.addr, %p.addr
+ ret i32 %sub
+}
+
+define i32 @sub_trunc_ptrtoaddr(ptr %p, i64 %offset) {
+; CHECK-LABEL: define i32 @sub_trunc_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[SUB:%.*]] = trunc i64 [[OFFSET]] to i32
+; CHECK-NEXT: ret i32 [[SUB]]
+;
+ %p2 = getelementptr i8, ptr %p, i64 %offset
+ %p.addr = ptrtoaddr ptr %p to i64
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ %p.addr.trunc = trunc i64 %p.addr to i32
+ %p2.addr.trunc = trunc i64 %p2.addr to i32
+ %sub = sub i32 %p2.addr.trunc, %p.addr.trunc
+ ret i32 %sub
+}
+
+define i16 @sub_trunc_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) {
+; CHECK-LABEL: define i16 @sub_trunc_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[SUB:%.*]] = trunc i32 [[OFFSET]] to i16
+; CHECK-NEXT: ret i16 [[SUB]]
+;
+ %p2 = getelementptr i8, ptr addrspace(1) %p, i32 %offset
+ %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %p.addr.trunc = trunc i32 %p.addr to i16
+ %p2.addr.trunc = trunc i32 %p2.addr to i16
+ %sub = sub i16 %p2.addr.trunc, %p.addr.trunc
+ ret i16 %sub
+}
+
+define i16 @sub_trunc_ptrtoint_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) {
+; CHECK-LABEL: define i16 @sub_trunc_ptrtoint_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[SUB:%.*]] = trunc i32 [[OFFSET]] to i16
+; CHECK-NEXT: ret i16 [[SUB]]
+;
+ %p2 = getelementptr i8, ptr addrspace(1) %p, i32 %offset
+ %p.int = ptrtoint ptr addrspace(1) %p to i64
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %p.int.trunc = trunc i64 %p.int to i16
+ %p2.addr.trunc = trunc i32 %p2.addr to i16
+ %sub = sub i16 %p2.addr.trunc, %p.int.trunc
+ ret i16 %sub
+}
+
+define i128 @sub_zext_ptrtoaddr(ptr %p, i64 %offset) {
+; CHECK-LABEL: define i128 @sub_zext_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[SUB:%.*]] = zext i64 [[OFFSET]] to i128
+; CHECK-NEXT: ret i128 [[SUB]]
+;
+ %p2 = getelementptr nuw i8, ptr %p, i64 %offset
+ %p.addr = ptrtoaddr ptr %p to i64
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ %p.addr.ext = zext i64 %p.addr to i128
+ %p2.addr.ext = zext i64 %p2.addr to i128
+ %sub = sub i128 %p2.addr.ext, %p.addr.ext
+ ret i128 %sub
+}
+
+define i64 @sub_zext_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) {
+; CHECK-LABEL: define i64 @sub_zext_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[SUB:%.*]] = zext i32 [[OFFSET]] to i64
+; CHECK-NEXT: ret i64 [[SUB]]
+;
+ %p2 = getelementptr nuw i8, ptr addrspace(1) %p, i32 %offset
+ %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %p.addr.ext = zext i32 %p.addr to i64
+ %p2.addr.ext = zext i32 %p2.addr to i64
+ %sub = sub i64 %p2.addr.ext, %p.addr.ext
+ ret i64 %sub
+}
+
+define i128 @sub_zext_ptrtoint_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) {
+; CHECK-LABEL: define i128 @sub_zext_ptrtoint_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[P2:%.*]] = getelementptr nuw i8, ptr addrspace(1) [[P]], i32 [[OFFSET]]
+; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64
+; CHECK-NEXT: [[P2_ADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[P2]] to i32
+; CHECK-NEXT: [[P_INT_EXT:%.*]] = zext i64 [[P_INT]] to i128
+; CHECK-NEXT: [[P2_ADDR_EXT:%.*]] = zext i32 [[P2_ADDR]] to i128
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw i128 [[P2_ADDR_EXT]], [[P_INT_EXT]]
+; CHECK-NEXT: ret i128 [[SUB]]
+;
+ %p2 = getelementptr nuw i8, ptr addrspace(1) %p, i32 %offset
+ %p.int = ptrtoint ptr addrspace(1) %p to i64
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %p.int.ext = zext i64 %p.int to i128
+ %p2.addr.ext = zext i32 %p2.addr to i128
+ %sub = sub i128 %p2.addr.ext, %p.int.ext
+ ret i128 %sub
+}
diff --git a/llvm/test/Transforms/InstCombine/scmp.ll b/llvm/test/Transforms/InstCombine/scmp.ll
index c0be5b9..2ae062cd 100644
--- a/llvm/test/Transforms/InstCombine/scmp.ll
+++ b/llvm/test/Transforms/InstCombine/scmp.ll
@@ -519,9 +519,7 @@ define <3 x i2> @scmp_unary_shuffle_ops(<3 x i8> %x, <3 x i8> %y) {
define i32 @scmp_sgt_slt(i32 %a) {
; CHECK-LABEL: define i32 @scmp_sgt_slt(
; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT: [[A_LOBIT:%.*]] = ashr i32 [[A]], 31
-; CHECK-NEXT: [[CMP_INV:%.*]] = icmp slt i32 [[A]], 1
-; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[CMP_INV]], i32 [[A_LOBIT]], i32 1
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
; CHECK-NEXT: ret i32 [[RETVAL_0]]
;
%cmp = icmp sgt i32 %a, 0
@@ -747,3 +745,55 @@ define i8 @scmp_from_select_eq_and_gt_neg3(i32 %x, i32 %y) {
%r = select i1 %eq, i8 0, i8 %sel1
ret i8 %r
}
+
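+; select (icmp slt X, 1), ashr X, bitwidth-1, 1 -> scmp(X, 0)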
+define i32 @scmp_ashr(i32 %a) {
+; CHECK-LABEL: define i32 @scmp_ashr(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %a.lobit = ashr i32 %a, 31
+ %cmp.inv = icmp slt i32 %a, 1
+ %retval.0 = select i1 %cmp.inv, i32 %a.lobit, i32 1
+ ret i32 %retval.0
+}
+
+; select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0)
+define i8 @scmp_ashr_sgt_pattern(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_sgt_pattern(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i8(i8 [[A]], i8 0)
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %a.lobit = ashr i8 %a, 7
+ %cmp = icmp sgt i8 %a, 0
+ %retval = select i1 %cmp, i8 1, i8 %a.lobit
+ ret i8 %retval
+}
+
+; select (icmp slt X, 1), ashr X, bitwidth-1, 1 -> scmp(X, 0)
+define i8 @scmp_ashr_slt_pattern(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_slt_pattern(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i8(i8 [[A]], i8 0)
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %a.lobit = ashr i8 %a, 7
+ %cmp = icmp slt i8 %a, 1
+ %retval = select i1 %cmp, i8 %a.lobit, i8 1
+ ret i8 %retval
+}
+
+define i8 @scmp_ashr_slt_pattern_neg(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_slt_pattern_neg(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[A_LOBIT:%.*]] = ashr i8 [[A]], 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[A]], 1
+; CHECK-NEXT: [[RETVAL:%.*]] = select i1 [[CMP]], i8 [[A_LOBIT]], i8 1
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+ %a.lobit = ashr i8 %a, 4
+ %cmp = icmp slt i8 %a, 1
+ %retval = select i1 %cmp, i8 %a.lobit, i8 1
+ ret i8 %retval
+}
diff --git a/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll b/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
index 2348490..2d06f42 100644
--- a/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
@@ -208,6 +208,3 @@ define <4 x i32> @extract_cond_type_mismatch(<4 x i32> %x, <4 x i32> %y, <5 x i1
%r = select i1 %cond, <4 x i32> %x, <4 x i32> %y
ret <4 x i32> %r
}
-
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/InstCombine/select-extractelement.ll b/llvm/test/Transforms/InstCombine/select-extractelement.ll
index 621d278..6f80b7d 100644
--- a/llvm/test/Transforms/InstCombine/select-extractelement.ll
+++ b/llvm/test/Transforms/InstCombine/select-extractelement.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=instcombine < %s | FileCheck %s
-declare void @v4float_user(<4 x float>) #0
+declare void @v4float_user(<4 x float>)
-define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) {
; CHECK-LABEL: @extract_one_select(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -17,7 +17,7 @@ define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
}
; Multiple extractelements
-define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) {
; CHECK-LABEL: @extract_two_select(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -34,7 +34,7 @@ define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) #
}
; Select has an extra non-extractelement user, don't change it
-define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) {
; CHECK-LABEL: @extract_one_select_user(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -49,7 +49,7 @@ define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) #0
ret float %extract
}
-define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @extract_one_vselect_user(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -67,7 +67,7 @@ define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32>
; Do not convert the vector select into a scalar select. That would increase
; the instruction count and potentially obfuscate a vector min/max idiom.
-define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @extract_one_vselect(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -81,7 +81,7 @@ define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
}
; Multiple extractelements from a vector select
-define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @extract_two_vselect(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -100,7 +100,7 @@ define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32
; The vector selects are not decomposed into scalar selects because that would increase
; the instruction count. Extract+insert is converted to non-lane-crossing shuffles.
; Test multiple extractelements
-define <4 x float> @simple_vector_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_vector_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_vector_select(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[C:%.*]], i64 0
@@ -232,5 +232,3 @@ define i32 @inf_loop_partial_undef(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x
%t11 = extractelement <2 x i32> %p, i32 0
ret i32 %t11
}
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/InstCombine/select_frexp.ll b/llvm/test/Transforms/InstCombine/select_frexp.ll
index d025aed..ccfcca1 100644
--- a/llvm/test/Transforms/InstCombine/select_frexp.ll
+++ b/llvm/test/Transforms/InstCombine/select_frexp.ll
@@ -115,10 +115,10 @@ define float @test_select_frexp_no_const(float %x, float %y, i1 %cond) {
define i32 @test_select_frexp_extract_exp(float %x, i1 %cond) {
; CHECK-LABEL: define i32 @test_select_frexp_extract_exp(
; CHECK-SAME: float [[X:%.*]], i1 [[COND:%.*]]) {
-; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND]], float 1.000000e+00, float [[X]]
-; CHECK-NEXT: [[FREXP:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SEL]])
+; CHECK-NEXT: [[FREXP:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]])
; CHECK-NEXT: [[FREXP_1:%.*]] = extractvalue { float, i32 } [[FREXP]], 1
-; CHECK-NEXT: ret i32 [[FREXP_1]]
+; CHECK-NEXT: [[FREXP_2:%.*]] = select i1 [[COND]], i32 1, i32 [[FREXP_1]]
+; CHECK-NEXT: ret i32 [[FREXP_2]]
;
%sel = select i1 %cond, float 1.000000e+00, float %x
%frexp = call { float, i32 } @llvm.frexp.f32.i32(float %sel)
@@ -132,7 +132,7 @@ define float @test_select_frexp_fast_math_select(float %x, i1 %cond) {
; CHECK-SAME: float [[X:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT: [[FREXP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]])
; CHECK-NEXT: [[MANTISSA:%.*]] = extractvalue { float, i32 } [[FREXP1]], 0
-; CHECK-NEXT: [[SELECT_FREXP:%.*]] = select nnan ninf nsz i1 [[COND]], float 5.000000e-01, float [[MANTISSA]]
+; CHECK-NEXT: [[SELECT_FREXP:%.*]] = select i1 [[COND]], float 5.000000e-01, float [[MANTISSA]]
; CHECK-NEXT: ret float [[SELECT_FREXP]]
;
%sel = select nnan ninf nsz i1 %cond, float 1.000000e+00, float %x
diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll
index 8eeaea1..ee70137 100644
--- a/llvm/test/Transforms/InstCombine/sub-gep.ll
+++ b/llvm/test/Transforms/InstCombine/sub-gep.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=instcombine < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-p2:32:32"
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-p2:32:32-p3:32:32:32:16"
define i64 @test_inbounds(ptr %base, i64 %idx) {
; CHECK-LABEL: @test_inbounds(
@@ -505,6 +505,23 @@ define i64 @negative_zext_ptrtoint_sub_ptrtoint_as2_nuw(i32 %offset) {
ret i64 %D
}
+define i64 @negative_zext_ptrtoint_sub_zext_ptrtoint_as2_nuw_truncating(i32 %offset) {
+; CHECK-LABEL: @negative_zext_ptrtoint_sub_zext_ptrtoint_as2_nuw_truncating(
+; CHECK-NEXT: [[A:%.*]] = getelementptr nuw bfloat, ptr addrspace(2) @Arr_as2, i32 [[OFFSET:%.*]]
+; CHECK-NEXT: [[A_IDX:%.*]] = ptrtoint ptr addrspace(2) [[A]] to i32
+; CHECK-NEXT: [[E:%.*]] = zext i32 [[A_IDX]] to i64
+; CHECK-NEXT: [[D:%.*]] = zext i16 ptrtoint (ptr addrspace(2) @Arr_as2 to i16) to i64
+; CHECK-NEXT: [[E1:%.*]] = sub nsw i64 [[E]], [[D]]
+; CHECK-NEXT: ret i64 [[E1]]
+;
+ %A = getelementptr nuw bfloat, ptr addrspace(2) @Arr_as2, i32 %offset
+ %B = ptrtoint ptr addrspace(2) %A to i32
+ %C = zext i32 %B to i64
+ %D = zext i16 ptrtoint (ptr addrspace(2) @Arr_as2 to i16) to i64
+ %E = sub i64 %C, %D
+ ret i64 %E
+}
+
define i64 @ptrtoint_sub_zext_ptrtoint_as2_inbounds_local(ptr addrspace(2) %p, i32 %offset) {
; CHECK-LABEL: @ptrtoint_sub_zext_ptrtoint_as2_inbounds_local(
; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds bfloat, ptr addrspace(2) [[P:%.*]], i32 [[OFFSET:%.*]]
@@ -614,6 +631,20 @@ define i64 @negative_zext_ptrtoint_sub_ptrtoint_as2_nuw_local(ptr addrspace(2) %
ret i64 %D
}
+define i64 @zext_ptrtoint_sub_ptrtoint_as3_nuw_local(ptr addrspace(3) %p, i16 %offset) {
+; CHECK-LABEL: @zext_ptrtoint_sub_ptrtoint_as3_nuw_local(
+; CHECK-NEXT: [[SUB:%.*]] = zext i16 [[GEP_IDX:%.*]] to i64
+; CHECK-NEXT: ret i64 [[SUB]]
+;
+ %gep = getelementptr nuw i8, ptr addrspace(3) %p, i16 %offset
+ %gep.int = ptrtoint ptr addrspace(3) %gep to i32
+ %p.int = ptrtoint ptr addrspace(3) %p to i32
+ %gep.int.ext = zext i32 %gep.int to i64
+ %p.int.ext = zext i32 %p.int to i64
+ %sub = sub i64 %gep.int.ext, %p.int.ext
+ ret i64 %sub
+}
+
define i64 @test30(ptr %foo, i64 %i, i64 %j) {
; CHECK-LABEL: @test30(
; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[I:%.*]], 2
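
The two tests added above hinge on index-width arithmetic: in the new p3 address space the pointer is 32 bits wide but the index width is only 16, and the nuw on the gep promises that base + zext(offset) does not wrap. Under that assumption the difference of the two ptrtoints is exactly the zero-extended offset, which is what @zext_ptrtoint_sub_ptrtoint_as3_nuw_local folds to; the truncating as2 variant keeps the sub because its i16 ptrtoint loses bits. A minimal sketch of the nuw reasoning (assumed function name, datalayout as in the test):

define i32 @gep_sub_is_offset(ptr addrspace(3) %p, i16 %off) {
  ; With nuw, ptrtoint(%gep) == ptrtoint(%p) + zext(%off) with no wrap,
  ; so the subtraction cancels the base and leaves zext i16 %off to i32.
  %gep = getelementptr nuw i8, ptr addrspace(3) %p, i16 %off
  %a = ptrtoint ptr addrspace(3) %gep to i32
  %b = ptrtoint ptr addrspace(3) %p to i32
  %d = sub i32 %a, %b
  ret i32 %d
}
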
diff --git a/llvm/test/Transforms/JumpThreading/ddt-crash3.ll b/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
index edaade32..26ba857 100644
--- a/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
+++ b/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
@@ -9,9 +9,9 @@ target triple = "x86_64-unknown-linux-gnu"
@global.2 = external local_unnamed_addr global i64, align 8
; Function Attrs: norecurse noreturn nounwind uwtable
-define void @hoge() local_unnamed_addr #0 {
+define void @hoge() local_unnamed_addr {
; CHECK-LABEL: define void @hoge(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
@@ -48,8 +48,6 @@ bb27: ; preds = %bb1
br label %bb26
}
-attributes #0 = { norecurse noreturn nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 7.0.0 "}
diff --git a/llvm/test/Transforms/LICM/volatile-alias.ll b/llvm/test/Transforms/LICM/volatile-alias.ll
index 410f3be..35e8844 100644
--- a/llvm/test/Transforms/LICM/volatile-alias.ll
+++ b/llvm/test/Transforms/LICM/volatile-alias.ll
@@ -10,7 +10,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Function Attrs: nounwind uwtable
-define i32 @foo(ptr dereferenceable(4) nonnull %p, ptr %q, i32 %n) #0 {
+define i32 @foo(ptr dereferenceable(4) nonnull %p, ptr %q, i32 %n) {
entry:
%p.addr = alloca ptr, align 8
%q.addr = alloca ptr, align 8
@@ -51,5 +51,3 @@ for.end: ; preds = %for.cond
%8 = load i32, ptr %s, align 4
ret i32 %8
}
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopRotate/noalias.ll b/llvm/test/Transforms/LoopRotate/noalias.ll
index f709bba..d6b2414 100644
--- a/llvm/test/Transforms/LoopRotate/noalias.ll
+++ b/llvm/test/Transforms/LoopRotate/noalias.ll
@@ -146,11 +146,7 @@ for.end: ; preds = %for.cond
}
; Function Attrs: inaccessiblememonly nounwind
-declare void @llvm.experimental.noalias.scope.decl(metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { inaccessiblememonly nounwind }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.experimental.noalias.scope.decl(metadata)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
index 5423368..25ded4b 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
@@ -157,4 +157,4 @@ bb:
br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll
index 6c34d1b..a878fc2 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll
@@ -62,6 +62,6 @@ for.end: ; preds = %fn3.exit
; Function Attrs: nounwind optsize
declare i32 @printf(ptr nocapture readonly, ...) #1
-attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind optsize }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll b/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll
index 914ea7a..9c20685 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll
@@ -71,8 +71,8 @@ fn1.exit: ; preds = %lor.end.i
; Function Attrs: nounwind optsize
declare i32 @printf(ptr nocapture readonly, ...) #1
-attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind optsize }
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll b/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
index 3364465..37e2f68 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
@@ -46,7 +46,7 @@ for.body3: ; preds = %for.body3, %for.bod
br i1 %exitcond, label %for.cond.loopexit, label %for.body3
}
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!0 = !{!1, !2, i64 16}
!1 = !{!"planet", !2, i64 0, !2, i64 8, !2, i64 16, !2, i64 24, !2, i64 32, !2, i64 40, !2, i64 48}
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll b/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll
index ee28aa1..df4dfa6 100644
--- a/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll
@@ -76,7 +76,7 @@ lee1.exit: ; preds = %lee1.exit.loopexit,
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+neon,+strict-align,+vfp3,-crypto,-fp-armv8,-fp16,-vfp4" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+neon,+strict-align,+vfp3,-crypto,-fp-armv8,-fp16,-vfp4" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 27ca414..199203a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -86,7 +86,8 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 41
+; CHECK: Cost of 1 for VF 16: EXPRESSION vp<%11> = ir<%sum> + partial.reduce.add (mul nuw nsw (ir<%1> zext to i64), (ir<%0> zext to i64))
+; CHECK: Cost for VF 16: 3
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body
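
The updated cost lines above reflect that the extends and the multiply feeding a partial reduction are now costed as one bundled EXPRESSION (cost 1) rather than as separate wide-vector instructions, which is why the total for VF 16 drops from 41 to 3; on AArch64 this shape presumably lowers to a single dot-product style instruction. The bundled pattern, as a standalone sketch (assumed names, not part of the patch):

declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)

define <4 x i32> @dot_step(<4 x i32> %acc, <16 x i8> %a, <16 x i8> %b) {
  ; zext + zext + mul + partial.reduce.add form the single costed expression.
  %a.wide = zext <16 x i8> %a to <16 x i32>
  %b.wide = zext <16 x i8> %b to <16 x i32>
  %mul = mul nuw nsw <16 x i32> %a.wide, %b.wide
  %red = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
  ret <4 x i32> %red
}
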
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
index 2d15431..8109d068 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
@@ -6,8 +6,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
define i32 @dotp(ptr %a, ptr %b) #0 {
-; CHECK-REGS-VP-NOT: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
-; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.
+; CHECK-REGS-VP: LV: Selecting VF: vscale x 16.
;
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 3dfa6df..287226f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -31,9 +31,9 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
@@ -52,37 +52,34 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19]] = sub <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = sub <16 x i32> zeroinitializer, [[TMP11]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -114,10 +111,10 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP17]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -184,9 +181,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -221,9 +218,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -262,10 +259,10 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -331,11 +328,11 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP13]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -352,37 +349,34 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP11]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -414,11 +408,11 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -486,11 +480,11 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP16]]
; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -508,37 +502,35 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19]] = sub <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP10]]
+; CHECK-SVE-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -570,11 +562,11 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP18]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP19]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -644,12 +636,12 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
-; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP15]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
-; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP15]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -683,9 +675,9 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]]
@@ -726,12 +718,12 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -802,13 +794,13 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP13]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
-; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP13]]
; CHECK-NEON-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP15]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -826,39 +818,37 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 4 x i32> [[TMP14]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP21]] = sub <vscale x 4 x i32> [[TMP19]], [[TMP20]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP11]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
+; CHECK-SVE-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP11]]
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP10]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -890,13 +880,13 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP19]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]], <vscale x 8 x i32> [[TMP20]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -969,9 +959,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP9]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1005,9 +995,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP5]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1045,9 +1035,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
+; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP15]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1110,8 +1100,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1142,8 +1132,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]])
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP3]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1178,8 +1168,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP11]])
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE2]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP12]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1240,10 +1230,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
+; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -1276,10 +1266,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -1316,10 +1306,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP15]])
+; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP16]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
index b033f60..b430efc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
@@ -467,3 +467,83 @@ loop:
exit:
ret i32 %red.next
}
+
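+; Check that a multiply feeding a partial reduction is handled correctly when
+; the multiply also has a second, otherwise-unused user (%second_use below).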
+define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
+; CHECK-LABEL: define i64 @partial_reduction_mul_two_users(
+; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i16 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]])
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[LOAD_EXT_EXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES2:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[LOAD:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[B]] to i32
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[CONV]], [[CONV]]
+; CHECK-NEXT: [[MUL_EXT:%.*]] = zext i32 [[MUL]] to i64
+; CHECK-NEXT: [[ADD]] = add i64 [[RES2]], [[MUL_EXT]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[MUL]], [[C]]
+; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32
+; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64
+; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[ADD_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %res1 = phi i64 [ 0, %entry ], [ %load.ext.ext, %loop ]
+ %res2 = phi i64 [ 0, %entry ], [ %add, %loop ]
+ %load = load i16, ptr %a, align 2
+ %iv.next = add i64 %iv, 1
+ %conv = sext i16 %b to i32
+ %mul = mul i32 %conv, %conv
+ %mul.ext = zext i32 %mul to i64
+ %add = add i64 %res2, %mul.ext
+ %second_use = or i32 %mul, %c ; otherwise unused; giving %mul a second user is all this test needs
+ %load.ext = sext i16 %load to i32
+ %load.ext.ext = sext i32 %load.ext to i64
+ %exitcond740.not = icmp eq i64 %iv, %n
+ br i1 %exitcond740.not, label %exit, label %loop
+
+exit:
+ ret i64 %add
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 8ece59a..d8f1a86 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -16,10 +16,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -68,7 +68,6 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -77,11 +76,12 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT2]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]]
@@ -89,7 +89,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
@@ -112,7 +112,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]]
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
@@ -136,7 +136,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0
; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[ACCUM1]], -1
; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: while.end.loopexit:
; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY1]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret void
@@ -495,7 +495,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: br label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
index 09b41fb..26e630f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
@@ -26,26 +26,26 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT: [[TMP12:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]])
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP11]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP12]])
+; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i32> [[TMP19]], [[TMP14]]
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP20]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: scalar.ph:
@@ -62,8 +62,8 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
@@ -71,25 +71,25 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
-; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
-; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
+; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]]
+; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
-; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]]
+; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-NOI8MM: scalar.ph:
@@ -137,26 +137,26 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]])
+; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP11]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP12]])
+; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i32> [[TMP19]], [[TMP14]]
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP20]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: scalar.ph:
@@ -173,8 +173,8 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
@@ -182,25 +182,25 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
-; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
-; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
+; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]]
+; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
-; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]]
+; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-NOI8MM: scalar.ph:
@@ -242,18 +242,18 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = mul <16 x i32> [[TMP10]], [[TMP8]]
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP14]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -337,18 +337,18 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = mul <16 x i32> [[TMP10]], [[TMP8]]
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP14]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index 801eb81..b847631 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -18,10 +18,10 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -47,17 +47,17 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP7]], [[TMP10]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -80,10 +80,10 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -720,26 +720,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
@@ -788,59 +788,59 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP19]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP20]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP20]])
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP21]])
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP26]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP28]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP29]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP30]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP30]])
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP29]])
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP33]], [[TMP37]]
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP40]])
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP33]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP37]])
; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP46]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP47]]
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <16 x i32> [[TMP44]], [[TMP48]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP49]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP50]])
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP41]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP45]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -884,26 +884,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
-; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
@@ -2025,7 +2025,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
@@ -2062,7 +2062,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
@@ -2091,7 +2091,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 1ace7d4..4636c1b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -18,11 +18,11 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP2]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
@@ -47,17 +47,17 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP8]])
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP9]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
@@ -75,26 +75,26 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul <vscale x 16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP6]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE5]])
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-MAXBW: scalar.ph:
@@ -134,10 +134,10 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP0]], [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -157,54 +157,78 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE14:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 32
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 48
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 32
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 48
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i64> [[TMP2]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD10]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nuw nsw <16 x i64> [[TMP12]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI2]], <16 x i64> [[TMP14]])
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD11]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <16 x i8> [[WIDE_LOAD7]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nuw nsw <16 x i64> [[TMP15]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE14]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI3]], <16 x i64> [[TMP17]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX15:%.*]] = add <2 x i64> [[PARTIAL_REDUCE13]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX16:%.*]] = add <2 x i64> [[PARTIAL_REDUCE14]], [[BIN_RDX15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX16]])
; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: for.exit:
-; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP9]]
;
; CHECK-MAXBW-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod(
; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP11]]
-; CHECK-MAXBW-NEXT: [[TMP15]] = add <vscale x 8 x i64> [[TMP14]], [[VEC_PHI]]
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP0]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-MAXBW-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP15]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-MAXBW: for.exit:
+; CHECK-MAXBW-NEXT: ret i64 [[TMP4]]
;
entry:
br label %for.body
@@ -245,10 +269,10 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP0]], [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -269,30 +293,50 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE15:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 24
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 24
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[NEXT_GEP3]], align 2
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i16>, ptr [[TMP18]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i16>, ptr [[TMP19]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[TMP2]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]])
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD6]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP2]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP7]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <8 x i16> [[WIDE_LOAD11]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <8 x i16> [[WIDE_LOAD7]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nuw nsw <8 x i64> [[TMP12]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE14]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI2]], <8 x i64> [[TMP14]])
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <8 x i16> [[WIDE_LOAD12]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <8 x i16> [[WIDE_LOAD8]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nuw nsw <8 x i64> [[TMP15]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE15]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI3]], <8 x i64> [[TMP17]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX16:%.*]] = add <2 x i64> [[PARTIAL_REDUCE14]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX17:%.*]] = add <2 x i64> [[PARTIAL_REDUCE15]], [[BIN_RDX16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX17]])
; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: for.exit:
; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP9]]
@@ -302,36 +346,28 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-MAXBW-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2
; CHECK-MAXBW-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i16>, ptr [[NEXT_GEP2]], align 2
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP15]], [[TMP13]]
-; CHECK-MAXBW-NEXT: [[TMP17]] = add <vscale x 4 x i64> [[TMP16]], [[VEC_PHI]]
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP0]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-MAXBW-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP17]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-MAXBW: for.exit:
+; CHECK-MAXBW-NEXT: ret i64 [[TMP4]]
;
entry:
br label %for.body
@@ -687,7 +723,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP138]])
; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
@@ -835,7 +871,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8
@@ -964,7 +1000,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8
@@ -1024,26 +1060,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP12]], [[TMP23]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP12]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP16]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP18]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP19]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP21]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]])
@@ -1092,58 +1128,58 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP15]])
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP17]])
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP17]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP18]])
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP24]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP25]])
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP23]])
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP25]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP26]])
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP30]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP31]]
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP48]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP33]])
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP48]], [[TMP33]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP34]])
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP36]], [[TMP39]]
-; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP40]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP41]])
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP38]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP39]])
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP40]], [[TMP41]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1165,21 +1201,21 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]]
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 1
@@ -1191,38 +1227,38 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
-; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
-; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD9]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 8 x i32> [[TMP29]], [[TMP23]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI7]], <vscale x 8 x i32> [[TMP31]])
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD12]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD14]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw <vscale x 8 x i32> [[TMP37]], [[TMP43]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI6]], <vscale x 8 x i32> [[TMP45]])
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD18]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD20]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw <vscale x 8 x i32> [[TMP51]], [[TMP57]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI5]], <vscale x 8 x i32> [[TMP59]])
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load <vscale x 8 x i8>, ptr [[TMP16]], align 1
-; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD24]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD26]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = mul nsw <vscale x 8 x i32> [[TMP65]], [[TMP71]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI4]], <vscale x 8 x i32> [[TMP73]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP27]], [[TMP32]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI3]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD5]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 16 x i32> [[TMP18]], [[TMP19]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI2]], <vscale x 16 x i32> [[TMP20]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD8]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD9]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = mul nsw <vscale x 16 x i32> [[TMP21]], [[TMP22]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP23]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD11]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD12]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP25]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP26]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE16]])
-; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE17]])
-; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]])
-; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE11]])
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE13]])
+; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE10]])
+; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]])
+; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -1390,7 +1426,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true
-; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]]
@@ -1525,7 +1561,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]])
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
@@ -1565,110 +1601,93 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD1]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul nuw nsw <16 x i64> [[TMP3]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP5]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP15]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP20]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP21]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP22]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[VEC_PHI1]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP13]])
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP8]], [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]]
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-MAXBW-NEXT: [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP12]], [[TMP8]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]])
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -1971,7 +1990,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2104,7 +2123,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2160,10 +2179,10 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2181,10 +2200,10 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK-INTERLEAVED: for.body.preheader:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
@@ -2194,19 +2213,29 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nuw nsw <16 x i64> [[TMP13]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i64> [[TMP10]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP12]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -2219,35 +2248,35 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-MAXBW: for.body.preheader:
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = insertelement <vscale x 8 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP12]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP16]], [[TMP14]]
-; CHECK-MAXBW-NEXT: [[TMP20]] = add <vscale x 8 x i64> [[TMP17]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP14]], [[TMP10]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP20]])
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -2319,17 +2348,16 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1
@@ -2341,42 +2369,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125>
; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126>
; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP31]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP24]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP27]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP34]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP33]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP30]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP21]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -2429,7 +2458,6 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]]
; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1
@@ -2441,42 +2469,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125>
; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126>
; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]])
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]])
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]])
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]])
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]])
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]])
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]])
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]])
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]])
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]])
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
-; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]])
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]])
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]])
-; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]])
-; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]])
-; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]])
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]])
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]])
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]])
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]])
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -2529,7 +2558,6 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]]
; CHECK-MAXBW-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1
@@ -2541,42 +2569,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-MAXBW-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125>
; CHECK-MAXBW-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126>
; CHECK-MAXBW-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127>
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]])
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]])
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]])
-; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]])
-; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]])
-; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]])
-; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
-; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]])
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]])
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]])
+; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]])
+; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP31]])
+; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
-; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
-; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]])
-; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]])
-; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]])
-; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]])
-; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]])
-; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
+; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
+; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]])
+; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]])
+; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]])
+; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]])
+; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]])
+; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
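The updated checks above replace plain wide-vector add reductions with @llvm.vector.partial.reduce.add, which folds a wide input into a narrower accumulator on every iteration. As a hedged aside (an illustrative standalone sketch, not taken from this patch; the function name is invented), the dot-product shape these tests expect looks like the following. Note that only the total sum of the result is defined; the mapping of input lanes onto accumulator lanes is unspecified.

declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)

define <4 x i32> @partial_reduce_sketch(<4 x i32> %acc, <16 x i8> %a, <16 x i8> %b) {
  ; Widen both i8 operands so the 16 products fit in i32 lanes.
  %a.wide = zext <16 x i8> %a to <16 x i32>
  %b.wide = zext <16 x i8> %b to <16 x i32>
  ; i8*i8 products are at most 255*255, so nuw and nsw both hold at i32.
  %mul = mul nuw nsw <16 x i32> %b.wide, %a.wide
  ; Accumulate all 16 products into the 4-lane running total.
  %red = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
  ret <4 x i32> %red
}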
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
index b308b92..bd9fae6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
@@ -23,12 +23,12 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star
; IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; IC2-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; IC2-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; IC2-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP4]], [[TMP4]]
+; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; IC2-NEXT: [[TMP7:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP5]]
-; IC2-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
-; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP7]])
+; IC2-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
+; IC2-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; IC2-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]]
+; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP6]])
; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; IC2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IC2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -80,18 +80,18 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star
; IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; IC4-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; IC4-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; IC4-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]]
+; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP13]])
; IC4-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; IC4-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; IC4-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
-; IC4-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i32> [[TMP6]], [[TMP6]]
; IC4-NEXT: [[TMP11:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]]
-; IC4-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]]
-; IC4-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]]
-; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
; IC4-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
+; IC4-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; IC4-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP10]], [[TMP10]]
; IC4-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]])
-; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]])
+; IC4-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; IC4-NEXT: [[TMP16:%.*]] = mul nuw nsw <16 x i32> [[TMP14]], [[TMP14]]
+; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
; IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
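With an interleave count above 1, each unrolled copy keeps its own partial accumulator; the middle block then adds the accumulators pairwise before the final horizontal reduction, as the BIN_RDX lines in the interleaved checks earlier in this diff show. A minimal sketch of that tail (illustrative only, not from the patch):

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

define i32 @combine_interleaved(<4 x i32> %part0, <4 x i32> %part1) {
  ; Pairwise-combine the per-copy accumulators, then reduce to a scalar.
  %bin.rdx = add <4 x i32> %part1, %part0
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
  ret i32 %sum
}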
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
index 7bb4715..6dae09e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
@@ -18,11 +18,11 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP2]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sub <16 x i32> zeroinitializer, [[TMP4]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -48,19 +48,19 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP14]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP14]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP15]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP15]])
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -78,27 +78,27 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: br label [[ENTRY:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP9]]
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP13]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP14]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul <vscale x 16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP6]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP8]])
; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-MAXBW: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 70532ad..46ec858 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -45,8 +45,8 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -138,8 +138,8 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -227,8 +227,8 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -321,8 +321,8 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -421,12 +421,12 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]])
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP7]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -811,8 +811,8 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -1000,11 +1000,10 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVE1: vector.ph:
; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
@@ -1012,7 +1011,8 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
@@ -1031,23 +1031,23 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVED: vector.ph:
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 32
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP22]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
@@ -1071,11 +1071,10 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i32 [[D]], [[N_VEC]]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[A]], i32 0
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[BROADCAST_SPLAT]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
@@ -1083,6 +1082,7 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]]
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
; CHECK-MAXBW-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP10]], align 1
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[BROADCAST_SPLAT]] to <vscale x 16 x i32>
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP9]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -1164,12 +1164,12 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP15]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI1]], <4 x i64> [[TMP5]])
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI2]], <4 x i64> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI3]], <4 x i64> [[TMP7]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
index ebf4a4f..ab1486d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
@@ -37,7 +37,7 @@ for.end: ; preds = %for.body
ret i32 %conv27
}
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang"}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
index 25ee100..70685c1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
@@ -192,7 +192,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK: LV(REG): VF = 16
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 12 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 6 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index 11cc971..fb7890a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -417,21 +417,17 @@ for.end: ; preds = %for.body, %entry
; Note: This test was added to ensure we always check the legality of reductions (and emit a warning if necessary) before checking for memory dependencies
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
+; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
+; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1)
define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @memory_dependence
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <4 x i32>
; CHECK: %[[LOAD2:.*]] = load <4 x i32>
-; CHECK: %[[LOAD3:.*]] = load <4 x i32>
-; CHECK: %[[LOAD4:.*]] = load <4 x i32>
-; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
-; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
-; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
-; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
+; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]]
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]]
; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]])
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 786a2aa..28d2a27 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -1630,4 +1630,4 @@ for.body: ; preds = %for.body, %entry
}
attributes #1 = { "target-features"="+sve" vscale_range(1, 16) }
-attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) }
+attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
index ae8dc2d..005ca8c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
@@ -175,18 +175,28 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
-; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -227,18 +237,28 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
-; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index b23702d..2a19402 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -319,46 +319,46 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP23]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[TMP33]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <2 x double> poison, double [[TMP25]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT12]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[TMP37]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP26]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = load double, ptr [[TMP39]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <2 x double> poison, double [[TMP27]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT16]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 2
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 4
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 6
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP25]], align 8
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP26]], align 8
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x double>, ptr [[TMP27]], align 8
; CHECK-NEXT: [[TMP28:%.*]] = fmul <2 x double> [[WIDE_LOAD]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[WIDE_LOAD12]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[WIDE_LOAD13]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP31:%.*]] = fmul <2 x double> [[WIDE_LOAD14]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP22]]
-; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP32]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP34]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP38]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP35]], align 8
+; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP36]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP32]], align 8
+; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> [[TMP29]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC15:%.*]] = shufflevector <4 x double> [[TMP37]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP33]], align 8
+; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <2 x double> [[TMP30]], <2 x double> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC16:%.*]] = shufflevector <4 x double> [[TMP38]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP34]], align 8
+; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> [[TMP31]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC17:%.*]] = shufflevector <4 x double> [[TMP39]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP35]], align 8
; CHECK-NEXT: [[TMP40:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP41:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP43:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP22]]
-; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP40]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP42]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP41]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP43]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP40]], align 8
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP41]], align 8
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP42]], align 8
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP43]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -435,7 +435,7 @@ exit:
ret void
}
-; We should interleave by 2 after narrowing interleave groups to saturate
+; FIXME: We should interleave by 2 after narrowing interleave groups to saturate
; load/store units.
define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) {
; CHECK-LABEL: define void @test_interleave_after_narrowing(
@@ -447,18 +447,12 @@ define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP5]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP4:%.*]] = fneg <4 x float> [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP5]]
; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4
-; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll
new file mode 100644
index 0000000..46b0ebd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2IC1 %s
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=VF2IC2 %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx"
+
+define void @load_store_interleave_group_block_invar_cond(ptr noalias %data, ptr noalias %dst.0, ptr noalias %dst.1, i1 %c) {
+; VF2IC1-LABEL: define void @load_store_interleave_group_block_invar_cond(
+; VF2IC1-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[DST_0:%.*]], ptr noalias [[DST_1:%.*]], i1 [[C:%.*]]) {
+; VF2IC1-NEXT: [[ENTRY:.*:]]
+; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]]
+; VF2IC1: [[VECTOR_PH]]:
+; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC1: [[VECTOR_BODY]]:
+; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ]
+; VF2IC1-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; VF2IC1-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; VF2IC1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC1-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC1-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF2IC1-NEXT: br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC1: [[PRED_STORE_IF]]:
+; VF2IC1-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VF2IC1: [[PRED_STORE_CONTINUE]]:
+; VF2IC1-NEXT: br i1 [[C]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1: [[PRED_STORE_IF2]]:
+; VF2IC1-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1: [[PRED_STORE_CONTINUE3]]:
+; VF2IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[DST_1]], i64 [[INDEX]]
+; VF2IC1-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP2]], align 1
+; VF2IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2IC1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC1-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2IC1: [[MIDDLE_BLOCK]]:
+; VF2IC1-NEXT: br label %[[EXIT:.*]]
+; VF2IC1: [[EXIT]]:
+; VF2IC1-NEXT: ret void
+;
+; VF2IC2-LABEL: define void @load_store_interleave_group_block_invar_cond(
+; VF2IC2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[DST_0:%.*]], ptr noalias [[DST_1:%.*]], i1 [[C:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*:]]
+; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]]
+; VF2IC2: [[VECTOR_PH]]:
+; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC2: [[VECTOR_BODY]]:
+; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE11:.*]] ]
+; VF2IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; VF2IC2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC2-NEXT: [[TMP2:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP2]]
+; VF2IC2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; VF2IC2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; VF2IC2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2IC2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP4]], align 8
+; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC2: [[PRED_STORE_IF]]:
+; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VF2IC2: [[PRED_STORE_CONTINUE]]:
+; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
+; VF2IC2: [[PRED_STORE_IF6]]:
+; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE7]]
+; VF2IC2: [[PRED_STORE_CONTINUE7]]:
+; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
+; VF2IC2: [[PRED_STORE_IF8]]:
+; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE9]]
+; VF2IC2: [[PRED_STORE_CONTINUE9]]:
+; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11]]
+; VF2IC2: [[PRED_STORE_IF10]]:
+; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE11]]
+; VF2IC2: [[PRED_STORE_CONTINUE11]]:
+; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[DST_1]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 2
+; VF2IC2-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP5]], align 1
+; VF2IC2-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP6]], align 1
+; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2IC2: [[MIDDLE_BLOCK]]:
+; VF2IC2-NEXT: br label %[[EXIT:.*]]
+; VF2IC2: [[EXIT]]:
+; VF2IC2-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %mul.2 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2
+ %l.0 = load i64, ptr %data.0, align 8
+ store i64 %l.0, ptr %data.0, align 8
+ %add.1 = or disjoint i64 %mul.2, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1
+ %l.1 = load i64, ptr %data.1, align 8
+ store i64 %l.1, ptr %data.1, align 8
+ br i1 %c, label %then, label %loop.latch
+
+then:
+ store i8 1, ptr %dst.0
+ br label %loop.latch
+
+loop.latch:
+ %gep.dst.1 = getelementptr inbounds i8, ptr %dst.1, i64 %iv
+ store i8 0, ptr %gep.dst.1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 100
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+define void @load_store_interleave_group_block_var_cond(ptr noalias %data, ptr %masks, ptr noalias %dst) {
+; VF2IC1-LABEL: define void @load_store_interleave_group_block_var_cond(
+; VF2IC1-SAME: ptr noalias [[DATA:%.*]], ptr [[MASKS:%.*]], ptr noalias [[DST:%.*]]) {
+; VF2IC1-NEXT: [[ENTRY:.*:]]
+; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]]
+; VF2IC1: [[VECTOR_PH]]:
+; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC1: [[VECTOR_BODY]]:
+; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ]
+; VF2IC1-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; VF2IC1-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; VF2IC1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC1-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC1-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF2IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[INDEX]]
+; VF2IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; VF2IC1-NEXT: [[TMP3:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD1]], zeroinitializer
+; VF2IC1-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; VF2IC1-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC1: [[PRED_STORE_IF]]:
+; VF2IC1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; VF2IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP5]]
+; VF2IC1-NEXT: store i8 1, ptr [[TMP6]], align 1
+; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VF2IC1: [[PRED_STORE_CONTINUE]]:
+; VF2IC1-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; VF2IC1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1: [[PRED_STORE_IF2]]:
+; VF2IC1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1
+; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP8]]
+; VF2IC1-NEXT: store i8 1, ptr [[TMP9]], align 1
+; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1: [[PRED_STORE_CONTINUE3]]:
+; VF2IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2IC1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC1-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2IC1: [[MIDDLE_BLOCK]]:
+; VF2IC1-NEXT: br label %[[EXIT:.*]]
+; VF2IC1: [[EXIT]]:
+; VF2IC1-NEXT: ret void
+;
+; VF2IC2-LABEL: define void @load_store_interleave_group_block_var_cond(
+; VF2IC2-SAME: ptr noalias [[DATA:%.*]], ptr [[MASKS:%.*]], ptr noalias [[DST:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*:]]
+; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]]
+; VF2IC2: [[VECTOR_PH]]:
+; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC2: [[VECTOR_BODY]]:
+; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ]
+; VF2IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; VF2IC2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC2-NEXT: [[TMP2:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP2]]
+; VF2IC2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; VF2IC2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; VF2IC2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2IC2-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP4]], align 8
+; VF2IC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 2
+; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
+; VF2IC2-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i8>, ptr [[TMP8]], align 1
+; VF2IC2-NEXT: [[TMP9:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], zeroinitializer
+; VF2IC2-NEXT: [[TMP10:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD6]], zeroinitializer
+; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
+; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC2: [[PRED_STORE_IF]]:
+; VF2IC2-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; VF2IC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP12]]
+; VF2IC2-NEXT: store i8 1, ptr [[TMP13]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VF2IC2: [[PRED_STORE_CONTINUE]]:
+; VF2IC2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
+; VF2IC2-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF2IC2: [[PRED_STORE_IF7]]:
+; VF2IC2-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 1
+; VF2IC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP15]]
+; VF2IC2-NEXT: store i8 1, ptr [[TMP16]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; VF2IC2: [[PRED_STORE_CONTINUE8]]:
+; VF2IC2-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; VF2IC2-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF2IC2: [[PRED_STORE_IF9]]:
+; VF2IC2-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 2
+; VF2IC2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP18]]
+; VF2IC2-NEXT: store i8 1, ptr [[TMP19]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; VF2IC2: [[PRED_STORE_CONTINUE10]]:
+; VF2IC2-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; VF2IC2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]]
+; VF2IC2: [[PRED_STORE_IF11]]:
+; VF2IC2-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3
+; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP21]]
+; VF2IC2-NEXT: store i8 1, ptr [[TMP22]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; VF2IC2: [[PRED_STORE_CONTINUE12]]:
+; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC2-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2IC2: [[MIDDLE_BLOCK]]:
+; VF2IC2-NEXT: br label %[[EXIT:.*]]
+; VF2IC2: [[EXIT]]:
+; VF2IC2-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %mul.2 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2
+ %l.0 = load i64, ptr %data.0, align 8
+ store i64 %l.0, ptr %data.0, align 8
+ %add.1 = or disjoint i64 %mul.2, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1
+ %l.1 = load i64, ptr %data.1, align 8
+ store i64 %l.1, ptr %data.1, align 8
+ %gep.mask = getelementptr inbounds i8, ptr %masks, i64 %iv
+ %l.mask = load i8, ptr %gep.mask
+ %c = icmp eq i8 %l.mask, 0
+ br i1 %c, label %then, label %loop.latch
+
+then:
+ store i8 1, ptr %gep.mask
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 100
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
index 2865495..c261760 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
@@ -13,8 +13,12 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[DATA]], align 8
-; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[DATA]], align 8
+; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[DATA]], align 8
+; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA]], align 8
; VF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF2: [[MIDDLE_BLOCK]]:
; VF2-NEXT: br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
index 305a692..b63e03d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
@@ -1,81 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5
-; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=IC1 %s
-; RUN: opt -p loop-vectorize -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s
+; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s
target triple = "aarch64-unknown-linux"
define void @load_store_interleave_group(ptr noalias %data) {
-; IC1-LABEL: define void @load_store_interleave_group(
-; IC1-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] {
-; IC1-NEXT: [[ENTRY:.*:]]
-; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]]
-; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; IC1: [[VECTOR_PH]]:
-; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]]
-; IC1-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
-; IC1-NEXT: br label %[[VECTOR_BODY:.*]]
-; IC1: [[VECTOR_BODY]]:
-; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1
-; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
-; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 8
-; IC1-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 8
-; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; IC1: [[MIDDLE_BLOCK]]:
-; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
-; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
-; IC1: [[SCALAR_PH]]:
-;
; CHECK-LABEL: define void @load_store_interleave_group(
; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP5]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP24]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]
; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP19]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP21]], align 8
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i64>, ptr [[TMP22]], align 8
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP23]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP1]], align 8
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD1]], ptr [[TMP21]], align 8
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD2]], ptr [[TMP22]], align 8
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD3]], ptr [[TMP23]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -105,82 +53,27 @@ exit:
}
define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr noalias %factor) {
-; IC1-LABEL: define void @test_2xi64_unary_op_load_interleave_group(
-; IC1-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] {
-; IC1-NEXT: [[ENTRY:.*:]]
-; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP1]]
-; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; IC1: [[VECTOR_PH]]:
-; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]]
-; IC1-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]]
-; IC1-NEXT: br label %[[VECTOR_BODY:.*]]
-; IC1: [[VECTOR_BODY]]:
-; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1
-; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP4]]
-; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP5]], align 8
-; IC1-NEXT: [[TMP6:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD]]
-; IC1-NEXT: store <vscale x 2 x double> [[TMP6]], ptr [[TMP5]], align 8
-; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; IC1: [[MIDDLE_BLOCK]]:
-; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1111, [[N_VEC]]
-; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
-; IC1: [[SCALAR_PH]]:
-;
; CHECK-LABEL: define void @test_2xi64_unary_op_load_interleave_group(
; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP5]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP8]], 0
-; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP29]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]
; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP24]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP19]]
; CHECK-NEXT: [[TMP7:%.*]] = load <vscale x 2 x double>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x double>, ptr [[TMP21]], align 8
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x double>, ptr [[TMP22]], align 8
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x double>, ptr [[TMP23]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = fneg <vscale x 2 x double> [[TMP7]]
-; CHECK-NEXT: [[TMP25:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP26:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP27:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD3]]
; CHECK-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP1]], align 8
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP25]], ptr [[TMP21]], align 8
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP26]], ptr [[TMP22]], align 8
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP27]], ptr [[TMP23]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
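
In both scalable tests above, the narrowed form processes vscale pairs per iteration: the <vscale x 2 x i64> access covers vscale*2 i64 elements, the data offset is 2*index, and the induction advances by plain vscale, which is why the removed unrolled GEPs and loads collapse to a single access. Condensed from the surviving check lines (names shortened):

  %vs   = call i64 @llvm.vscale.i64()
  %off  = shl nsw i64 %index, 1                       ; element offset = 2 * pair index
  %gep  = getelementptr inbounds i64, ptr %data, i64 %off
  %v    = load <vscale x 2 x i64>, ptr %gep, align 8  ; vscale pairs = vscale*2 i64s
  store <vscale x 2 x i64> %v, ptr %gep, align 8
  %index.next = add nuw i64 %index, %vs               ; advance by vscale pairs
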
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
index abfb44d..d290f2d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
@@ -60,26 +60,32 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP15:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[STRIDED_VEC2]]
; CHECK-NEXT: [[TMP16:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[STRIDED_VEC5]]
-; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr [[TMP8]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr [[TMP9]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC6:%.*]] = shufflevector <4 x i64> [[TMP18]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC6]], ptr [[TMP9]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
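
With unroll factor 2, the rewritten checks above keep two group accesses per iteration but change two things: the per-part scalar factor load plus broadcast splat becomes a plain <2 x i64> vector load (a distinct factor per pair), and the index now steps by 4 pairs with the second part offset by 2. A side-by-side sketch of the factor access, condensed from the removed and added lines (GEP name assumed):

  ; before: one scalar load per part, splatted across the group
  %f     = load i64, ptr %gep.factor, align 8
  %ins   = insertelement <2 x i64> poison, i64 %f, i64 0
  %splat = shufflevector <2 x i64> %ins, <2 x i64> poison, <2 x i32> zeroinitializer
  ; after: one vector load per part, one factor for each of the two pairs
  %factors = load <2 x i64>, ptr %gep.factor, align 8
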
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index f2e689c..75980ba 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -328,8 +328,10 @@ define void @same_live_in_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> [[BROADCAST_SPLAT]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
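
The new checks above interleave the broadcast value with itself before storing. Both shuffles are no-ops on a splat, so the wide store still writes the same value to every lane; a short lane-arithmetic sketch (pointer name assumed):

  ; concatenating a splat with itself is still a splat ...
  %cat = shufflevector <2 x i64> %splat, <2 x i64> %splat, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; ... and any permutation of a splat is the splat itself
  %ilv = shufflevector <4 x i64> %cat, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i64> %ilv, ptr %dst, align 8
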
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index d4e5dea..49f663f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -23,18 +23,15 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4)
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, vp<[[REDUCE:%.+]]> (VF scaled by 1/4)
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
-; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
-; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
-; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
-; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<[[ACC]]>, ir<%mul>
+; CHECK-NEXT: EXPRESSION vp<[[REDUCE]]> = ir<[[ACC]]> + partial.reduce.add (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32))
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -42,7 +39,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<[[REDUCE]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -89,10 +86,10 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4)
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a>
-; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]>
; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b>
; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul>
; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>
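
The VPlan change above folds the two WIDEN-CAST recipes and the WIDEN mul into a single abstract EXPRESSION recipe that defines vp<[[REDUCE]]> directly; only the epilogue plan in the second hunk still prints the recipes separately (with the zexts reordered). With VF 16 and the printed 1/4 scale, the expression would materialize as roughly the following (types assumed from the "VF scaled by 1/4" note; they are not printed in the plan):

  %ext.a = zext <16 x i8> %load.a to <16 x i32>
  %ext.b = zext <16 x i8> %load.b to <16 x i32>
  %mul   = mul <16 x i32> %ext.b, %ext.a
  %add   = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
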
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
index 0f398a6..2579918 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -327,5 +327,5 @@ for.end: ; preds = %for.body, %entry
declare float @fabsf(float)
-attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
index 0a9b1e0..61e3a18 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
@@ -60,10 +60,10 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -125,17 +125,17 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP12]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -222,10 +222,10 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -287,17 +287,17 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP12]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -384,10 +384,10 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -449,18 +449,18 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = mul <8 x i32> [[TMP10]], [[TMP8]]
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP14]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -545,10 +545,10 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -610,18 +610,18 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = mul <8 x i32> [[TMP10]], [[TMP8]]
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP14]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
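
All four RISCV hunks above are pure reorderings: each sext/zext moves next to the mul it feeds and each mul next to its partial-reduce call, with fresh TMP numbers but identical operands, so the checked dataflow is unchanged. The vqdotsu case is the one mixed-signedness pattern; condensed from the scalable checks:

  %sa = sext <vscale x 4 x i8> %wide.load1 to <vscale x 4 x i32>  ; b operand: signed
  %zb = zext <vscale x 4 x i8> %wide.load  to <vscale x 4 x i32>  ; a operand: unsigned
  %m  = mul <vscale x 4 x i32> %sa, %zb
  %r  = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> %acc, <vscale x 4 x i32> %m)
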
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index e42e2c7..b26e9cf 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -1779,7 +1779,7 @@ for.body: ; preds = %entry, %for.body
; CHECK: LV: Scalar loop costs: 24
; CHECK: LV: Vector loop of width 2 costs: 33
; CHECK: LV: Vector loop of width 4 costs: 30
-; CHECK: LV: Selecting VF: 4
+; CHECK: LV: Selecting VF: 1
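
The corrected check matches the costs printed just above: with per-width costs of 33 (width 2) and 30 (width 4) against a scalar loop cost of 24, no vector width is profitable, so the vectorizer should report VF 1; the old check contradicted the cost lines it follows. As a worked comparison:

  ; width 2: 33 vs scalar 24  -> not profitable
  ; width 4: 30 vs scalar 24  -> not profitable
  ; no profitable width       -> LV: Selecting VF: 1
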
define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll
new file mode 100644
index 0000000..2338da5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s -S | FileCheck %s
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s -S | FileCheck %s --check-prefix=CHECK-MAX-BANDWIDTH
+
+target triple = "wasm32"
+
+define hidden i32 @accumulate_add_u8_u8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: define hidden i32 @accumulate_add_u8_u8(
+; CHECK-SAME: ptr noundef readonly [[A:%.*]], ptr noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[ADD3:%.*]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[ADD3]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RED]], [[CONV]]
+; CHECK-NEXT: [[ADD3]] = add i32 [[ADD]], [[CONV2]]
+; CHECK-NEXT: [[INC]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; CHECK-MAX-BANDWIDTH-LABEL: define hidden i32 @accumulate_add_u8_u8(
+; CHECK-MAX-BANDWIDTH-SAME: ptr noundef readonly [[A:%.*]], ptr noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAX-BANDWIDTH-NEXT: [[ENTRY:.*]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-MAX-BANDWIDTH: [[VECTOR_PH]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16
+; CHECK-MAX-BANDWIDTH-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-MAX-BANDWIDTH-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-MAX-BANDWIDTH: [[VECTOR_BODY]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[INDEX]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[INDEX]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAX-BANDWIDTH-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAX-BANDWIDTH-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]])
+; CHECK-MAX-BANDWIDTH-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-MAX-BANDWIDTH: [[MIDDLE_BLOCK]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE2]])
+; CHECK-MAX-BANDWIDTH-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; CHECK-MAX-BANDWIDTH: [[SCALAR_PH]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-MAX-BANDWIDTH: [[FOR_COND_CLEANUP]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[ADD3:%.*]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: ret i32 [[RESULT_0_LCSSA]]
+; CHECK-MAX-BANDWIDTH: [[FOR_BODY]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[IV:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[RED:%.*]] = phi i32 [ [[ADD3]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[IV]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32
+; CHECK-MAX-BANDWIDTH-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[IV]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32
+; CHECK-MAX-BANDWIDTH-NEXT: [[ADD:%.*]] = add i32 [[RED]], [[CONV]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[ADD3]] = add i32 [[ADD]], [[CONV2]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[INC]] = add nuw i32 [[IV]], 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret i32 %add3
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %red = phi i32 [ %add3, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %iv
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv2 = zext i8 %1 to i32
+ %add = add i32 %red, %conv
+ %add3 = add i32 %add, %conv2
+ %inc = add nuw i32 %iv, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
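
The new test above checks that without -vectorizer-maximize-bandwidth the double accumulation stays as two plain <4 x i32> adds at VF 4, while the max-bandwidth run widens to VF 16 and chains two partial reductions into one <4 x i32> accumulator. A semantics sketch of that chain (the intrinsic only fixes the total sum; how input lanes map onto the narrower accumulator is left to the target):

  ; fold 16 zero-extended bytes into the 4-lane accumulator, twice per iteration
  %p1 = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.ext)
  %p2 = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %p1,  <16 x i32> %b.ext)
  ; one final 4-lane reduce in the middle block yields the scalar sum
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %p2)
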
diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 9168ebf..08d39ea 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -214,7 +214,7 @@ for.end15: ; preds = %for.end.us, %entry
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
!3 = !{!4, !5}
!4 = !{!4}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
index 44e9d3e..b7f8be1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
@@ -71,6 +71,6 @@ declare i32 @printf(ptr, ...) #1
; Function Attrs: nounwind
declare i32 @puts(ptr nocapture readonly) #2
-attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
index f066000..52e90e4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -362,4 +362,4 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
-attributes #0 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="false" }
+attributes #0 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 005696a..e405fe7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -335,4 +335,4 @@ for.body: ; preds = %for.body.preheader,
br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit99
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
index b98a2ea..c7550ca 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
@@ -143,7 +143,7 @@ for.inc:
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !11
}
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index f4d80af..2a3ce03 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -8,69 +8,15 @@ target triple = "x86_64-unknown-linux"
define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
; CHECK-LABEL: define void @test_4xi64(
; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ITER_CHECK:.*]]:
+; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
-; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_MOD_VF1:%.*]] = urem i64 [[N]], 16
-; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[N_MOD_VF1]]
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP20]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP21]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP22]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[INDEX]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP23]], align 8
-; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT1]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT6]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT8]], [[WIDE_LOAD3]]
-; CHECK-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT10]], [[WIDE_LOAD4]]
-; CHECK-NEXT: store <4 x i64> [[TMP15]], ptr [[TMP11]], align 8
-; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr [[TMP12]], align 8
-; CHECK-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP13]], align 8
-; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr [[TMP23]], align 8
-; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]]
-; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N]], [[N_VEC1]]
-; CHECK-NEXT: br i1 [[CMP_N1]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
-; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF1]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
-; CHECK: [[VEC_EPILOG_PH]]:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
-; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
@@ -81,15 +27,15 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
-; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
@@ -110,7 +56,7 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -171,7 +117,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -194,7 +140,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -249,7 +195,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -272,7 +218,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -327,7 +273,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -350,7 +296,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -405,7 +351,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -428,7 +374,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -489,7 +435,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -513,7 +459,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -573,7 +519,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -598,7 +544,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -707,7 +653,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -731,7 +677,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -765,20 +711,19 @@ exit:
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]}
-; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
-; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]}
-; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]}
-; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META2]], [[META1]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
-; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
-; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
index 2c187ca..9471e4a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
@@ -72,7 +72,7 @@ for.end: ; preds = %for.body, %entry
ret void, !dbg !27
}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
diff --git a/llvm/test/Transforms/LoopVectorize/constantfolder.ll b/llvm/test/Transforms/LoopVectorize/constantfolder.ll
index 66592b0..fdeb497 100644
--- a/llvm/test/Transforms/LoopVectorize/constantfolder.ll
+++ b/llvm/test/Transforms/LoopVectorize/constantfolder.ll
@@ -288,3 +288,72 @@ loop.latch:
exit:
ret void
}
+
+define void @const_fold_binaryintrinsic(ptr %dst, i64 %d) {
+; CHECK-LABEL: define void @const_fold_binaryintrinsic(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[D:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: store i64 3, ptr [[DST]], align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %const.0 = xor i64 %d, %d
+  %max = call i64 @llvm.umax.i64(i64 %const.0, i64 3)
+  store i64 %max, ptr %dst, align 2
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ult i64 %iv.next, 100
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+define void @const_fold_widegep(ptr noalias %A, ptr noalias %B, i64 %d) {
+; CHECK-LABEL: define void @const_fold_widegep(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[D:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: store ptr [[A]], ptr [[B]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %const.0 = xor i64 %d, %d
+ %gep.A = getelementptr i64, ptr %A, i64 %const.0
+ %gep.B = getelementptr i64, ptr %B, i64 %const.0
+ store ptr %gep.A, ptr %gep.B
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cond = icmp ult i64 %iv.next, 100
+ br i1 %exit.cond, label %loop, label %exit
+
+exit:
+ ret void
+}
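
A short note on the two tests added above (reasoning from the IR shown, not part of the checked-in file): both loop bodies collapse to a single loop-invariant store because `xor i64 %d, %d` is 0 for any %d, so the folds behind the CHECK lines are:

  ;   %const.0 = xor i64 %d, %d             ; always 0
  ;   @llvm.umax.i64(i64 0, i64 3)          ; folds to 3  -> "store i64 3, ptr %dst"
  ;   getelementptr i64, ptr %A, i64 0      ; folds to %A -> "store ptr %A, ptr %B"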
diff --git a/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll b/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
index 8ce2a97..0656de4 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
@@ -50,7 +50,7 @@ for.body: ; preds = %for.body.preheader,
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !9, !llvm.loop !18
}
-attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
index 65bd36c..5e51624f 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
@@ -134,7 +134,7 @@ for.cond.cleanup:
ret void, !dbg !44
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
index 4b7b714..77ec95a 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
@@ -145,7 +145,7 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !43, !llvm.loop !55
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
index 5cf99b8..3487331 100644
--- a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
@@ -135,7 +135,7 @@ thread-pre-split5: ; preds = %.lr.ph
ret void
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 8efe29a..fc2e233 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -783,7 +783,7 @@ for.body: ; preds = %for.body, %entry
@SA = common global i32 0, align 4
@SB = common global float 0.000000e+00, align 4
-define void @int_float_struct(ptr nocapture readonly %A) #0 {
+define void @int_float_struct(ptr nocapture readonly %A) {
; CHECK-LABEL: @int_float_struct(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
@@ -1546,5 +1546,3 @@ loop:
end:
ret void
}
-
-attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/Transforms/LoopVectorize/metadata-width.ll b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
index ddf9029..22243d9 100644
--- a/llvm/test/Transforms/LoopVectorize/metadata-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
@@ -89,7 +89,7 @@ for.end: ; preds = %for.body, %entry
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
!0 = !{!0, !1, !5}
!1 = !{!"llvm.loop.vectorize.width", i32 8}
diff --git a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
index a1fc1b8..87b5a0b 100644
--- a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
+++ b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
@@ -56,4 +56,4 @@ for.end: ; preds = %for.body
ret i32 0
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
index 705bbab..64ae730 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
@@ -218,4 +218,4 @@ for.end: ; preds = %for.body, %entry
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
index 1effb10..872a3929 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
@@ -129,4 +129,4 @@ for.end: ; preds = %for.body
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
index 8224d6b..137e098 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
@@ -104,8 +104,8 @@ for.end26: ; preds = %for.cond4.for.end26
}
declare i32 @fn2(double) #1
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
!0 = !{!"int", !1}
!1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll
new file mode 100644
index 0000000..01934b1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll
@@ -0,0 +1,30 @@
+; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Make sure an unsafe user-specified interleave count is ignored.
+
+; CHECK: remark: <unknown>:0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
+; CHECK-LABEL: @loop_distance_4
+define void @loop_distance_4(ptr %a, ptr %b) {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 4, %entry ], [ %iv.next, %loop ]
+ %0 = getelementptr i32, ptr %b, i64 %iv
+ %arrayidx = getelementptr i8, ptr %0, i64 -16
+ %1 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+ %2 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %2, %1
+ store i32 %add, ptr %0, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 64
+ br i1 %exitcond.not, label %for.end, label %loop, !llvm.loop !1
+
+for.end:
+ ret void
+}
+
+!1 = !{!1, !2, !3}
+!2 = !{!"llvm.loop.interleave.count", i32 4}
+!3 = !{!"llvm.loop.vectorize.width", i32 4}
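
A note on why the hint is unsafe in the new test above (reasoning from the IR, not part of the checked-in file): the store to %0 (b[iv]) and the load from %arrayidx (b[iv - 4]) form a loop-carried dependence with distance 4 iterations, so the effective vector width must satisfy VF * IC <= 4. A minimal sketch:

  ; iteration i writes b[i]; iteration i+4 reads it back
  ;   store i32 %add, ptr %0      ; b[iv]
  ;   load  i32, ptr %arrayidx    ; b[iv - 4]
  ; VF=4, IC=1 -> width 4  (safe; matches the vectorize.width hint)
  ; VF=4, IC=4 -> width 16 (unsafe; the interleave hint is dropped with the remark)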
diff --git a/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll b/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
index e2f5007..cc187de 100644
--- a/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
+++ b/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
@@ -35,11 +35,7 @@ if.end5: ; preds = %if.then, %entry
ret i1 %rez.0
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
diff --git a/llvm/test/Transforms/MergeICmps/X86/pr41917.ll b/llvm/test/Transforms/MergeICmps/X86/pr41917.ll
index d4fd34a..d8065e8 100644
--- a/llvm/test/Transforms/MergeICmps/X86/pr41917.ll
+++ b/llvm/test/Transforms/MergeICmps/X86/pr41917.ll
@@ -7,13 +7,13 @@ target triple = "i386-pc-windows-msvc19.11.0"
%class.a = type { i32, i32, i32, i32, i32 }
; Function Attrs: nounwind optsize
-define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly align 4 %g, ptr byval(%class.a) nocapture readonly align 4 %p2) local_unnamed_addr #0 {
+define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly align 4 %g, ptr byval(%class.a) nocapture readonly align 4 %p2) local_unnamed_addr {
; CHECK-LABEL: @pr41917(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call zeroext i1 @f2() #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call zeroext i1 @f2()
; CHECK-NEXT: br i1 [[CALL]], label [[LAND_RHS:%.*]], label %"land.end+land.rhs3"
; CHECK: land.rhs:
-; CHECK-NEXT: [[CALL1:%.*]] = tail call zeroext i1 @f2() #[[ATTR3]]
+; CHECK-NEXT: [[CALL1:%.*]] = tail call zeroext i1 @f2()
; CHECK-NEXT: br label %"land.end+land.rhs3"
; CHECK: "land.end+land.rhs3":
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_A:%.*]], ptr [[G:%.*]], i32 0, i32 1
@@ -25,11 +25,11 @@ define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly alig
; CHECK-NEXT: ret i1 [[TMP2]]
;
entry:
- %call = tail call zeroext i1 @f2() #2
+ %call = tail call zeroext i1 @f2()
br i1 %call, label %land.rhs, label %land.end
land.rhs: ; preds = %entry
- %call1 = tail call zeroext i1 @f2() #2
+ %call1 = tail call zeroext i1 @f2()
br label %land.end
land.end: ; preds = %land.rhs, %entry
@@ -53,11 +53,7 @@ land.end6: ; preds = %land.rhs3, %land.en
ret i1 %4
}
-declare dso_local zeroext i1 @f2() local_unnamed_addr #1
-
-attributes #0 = { nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind optsize }
+declare dso_local zeroext i1 @f2() local_unnamed_addr
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
index 1cf9fd9..12b4184 100644
--- a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
+++ b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
@@ -4,7 +4,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
;; Function Attrs: nounwind ssp uwtable
;; We should eliminate the sub, and one of the phi nodes
-define void @vnum_test1(ptr %data) #0 {
+define void @vnum_test1(ptr %data) {
; CHECK-LABEL: @vnum_test1(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3
@@ -74,7 +74,7 @@ bb19: ; preds = %bb4
;; We should eliminate the sub, one of the phi nodes, prove the store of the sub
;; and the load of data are equivalent, that the load always produces constant 0, and
;; delete the load replacing it with constant 0.
-define i32 @vnum_test2(ptr %data) #0 {
+define i32 @vnum_test2(ptr %data) {
; CHECK-LABEL: @vnum_test2(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3
@@ -142,11 +142,10 @@ bb21: ; preds = %bb4
ret i32 %p.0
}
-
; Function Attrs: nounwind ssp uwtable
;; Same as test 2, with a conditional store of m-n, so it has to also discover
;; that data ends up with the same value no matter what branch is taken.
-define i32 @vnum_test3(ptr %data) #0 {
+define i32 @vnum_test3(ptr %data) {
; CHECK-LABEL: @vnum_test3(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3
@@ -305,7 +304,6 @@ bb3: ; preds = %bb2
%tmp3 = sub i32 %tmp, %phi2
ret i32 %tmp3
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0, !0, !0}
diff --git a/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll b/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll
index 8b2d662..b6da86b 100644
--- a/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll
+++ b/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
%"union.llvm::SmallVectorBase::U" = type { x86_fp80 }
; Function Attrs: ssp uwtable
-define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
+define void @_Z4testv() personality ptr @__gxx_personality_v0 {
; CHECK: @_Z4testv()
; CHECK: invoke.cont:
; CHECK: br i1 true, label %new.notnull.i11, label %if.end.i14
@@ -18,7 +18,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
entry:
%sv = alloca %"class.llvm::SmallVector", align 16
- call void @llvm.lifetime.start.p0(ptr %sv) #1
+ call void @llvm.lifetime.start.p0(ptr %sv)
%FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
store ptr %FirstEl.i.i.i.i.i.i, ptr %sv, align 16, !tbaa !4
%EndX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -83,11 +83,11 @@ invoke.cont3: ; preds = %invoke.cont2
br i1 %cmp.i.i.i.i19, label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21, label %if.then.i.i.i20
if.then.i.i.i20: ; preds = %invoke.cont3
- call void @free(ptr %5) #1
+ call void @free(ptr %5)
br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21
_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21: ; preds = %invoke.cont3, %if.then.i.i.i20
- call void @llvm.lifetime.end.p0(ptr %sv) #1
+ call void @llvm.lifetime.end.p0(ptr %sv)
ret void
lpad: ; preds = %if.end.i14, %if.end.i, %invoke.cont2
@@ -98,7 +98,7 @@ lpad: ; preds = %if.end.i14, %if.end
br i1 %cmp.i.i.i.i, label %eh.resume, label %if.then.i.i.i
if.then.i.i.i: ; preds = %lpad
- call void @free(ptr %7) #1
+ call void @free(ptr %7)
br label %eh.resume
eh.resume: ; preds = %if.then.i.i.i, %lpad
@@ -106,24 +106,19 @@ eh.resume: ; preds = %if.then.i.i.i, %lpa
}
; Function Attrs: nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
declare i32 @__gxx_personality_v0(...)
-declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr) #2
+declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr)
; Function Attrs: nounwind
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
-declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64) #2
+declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64)
; Function Attrs: nounwind
-declare void @free(ptr nocapture) #3
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @free(ptr nocapture)
!0 = !{!"any pointer", !1}
!1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/Transforms/NewGVN/equivalent-phi.ll b/llvm/test/Transforms/NewGVN/equivalent-phi.ll
index ba4fc14..388b941 100644
--- a/llvm/test/Transforms/NewGVN/equivalent-phi.ll
+++ b/llvm/test/Transforms/NewGVN/equivalent-phi.ll
@@ -8,7 +8,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
;; one set of indexing calculations and a load
; Function Attrs: nounwind ssp uwtable
-define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) #0 {
+define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-LABEL: @bar(
; CHECK-NEXT: bb:
; CHECK-NEXT: br label [[BB3:%.*]]
@@ -59,8 +59,6 @@ bb20: ; preds = %bb17
ret i32 %tmp14
}
-attributes #0 = { nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/NewGVN/memory-handling.ll b/llvm/test/Transforms/NewGVN/memory-handling.ll
index f83d145..71e9041 100644
--- a/llvm/test/Transforms/NewGVN/memory-handling.ll
+++ b/llvm/test/Transforms/NewGVN/memory-handling.ll
@@ -282,10 +282,10 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) #2
; Function Attrs: inlinehint nounwind readonly uwtable
declare i32 @tolower(i32) local_unnamed_addr #3
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind readnone }
attributes #2 = { argmemonly nounwind }
-attributes #3 = { inlinehint nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { inlinehint nounwind readonly uwtable }
attributes #4 = { nounwind readnone }
attributes #5 = { nounwind readonly }
diff --git a/llvm/test/Transforms/NewGVN/pr31483.ll b/llvm/test/Transforms/NewGVN/pr31483.ll
index 82e9a2a..c1fb836 100644
--- a/llvm/test/Transforms/NewGVN/pr31483.ll
+++ b/llvm/test/Transforms/NewGVN/pr31483.ll
@@ -6,7 +6,7 @@ target datalayout = "E-m:e-i64:64-n32:64"
;; Ensure we do not believe the indexing increments are unreachable due to incorrect memory
;; equivalence detection. In PR31483, we were deleting those blocks as unreachable
; Function Attrs: nounwind
-define signext i32 @ham(ptr %arg, ptr %arg1) #0 {
+define signext i32 @ham(ptr %arg, ptr %arg1) {
; CHECK-LABEL: @ham(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8
@@ -89,12 +89,7 @@ bb23: ; preds = %bb2
ret i32 undef
}
-declare signext i32 @zot(ptr, ...) #1
+declare signext i32 @zot(ptr, ...)
; Function Attrs: nounwind
-declare void @llvm.va_end(ptr) #2
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
+declare void @llvm.va_end(ptr)
diff --git a/llvm/test/Transforms/NewGVN/pr31501.ll b/llvm/test/Transforms/NewGVN/pr31501.ll
index 353c693..b2ba42b 100644
--- a/llvm/test/Transforms/NewGVN/pr31501.ll
+++ b/llvm/test/Transforms/NewGVN/pr31501.ll
@@ -49,9 +49,9 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
%struct.wombat.28 = type <{ ptr, i8, i8, [6 x i8] }>
; Function Attrs: norecurse nounwind ssp uwtable
-define weak_odr hidden ptr @quux(ptr %arg, ptr %arg1) local_unnamed_addr #0 align 2 {
+define weak_odr hidden ptr @quux(ptr %arg, ptr %arg1) local_unnamed_addr align 2 {
; CHECK-LABEL: define weak_odr hidden ptr @quux(
-; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
+; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]]) local_unnamed_addr align 2 {
; CHECK-NEXT: [[BB:.*]]:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_BARNEY:%.*]], ptr [[ARG]], i64 0, i32 3, i32 0, i32 0, i32 0
; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !tbaa [[ANYPTR_TBAA2:![0-9]+]]
@@ -112,8 +112,6 @@ bb21: ; preds = %bb19, %bb
ret ptr %tmp22
}
-attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/NewGVN/pr33187.ll b/llvm/test/Transforms/NewGVN/pr33187.ll
index 969f172..61eb7a5 100644
--- a/llvm/test/Transforms/NewGVN/pr33187.ll
+++ b/llvm/test/Transforms/NewGVN/pr33187.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
;; Ensure we don't change after value numbering by accidentally deleting the wrong expression.
; RUN: opt -passes=newgvn -S %s | FileCheck %s
-define void @fn1(i1 %arg) local_unnamed_addr #0 {
+define void @fn1(i1 %arg) local_unnamed_addr {
; CHECK-LABEL: @fn1(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_COND_PREHEADER:%.*]]
@@ -89,8 +89,7 @@ if.end18: ; preds = %L, %while.body12
br label %while.cond10
}
-
-define void @hoge() local_unnamed_addr #0 {
+define void @hoge() local_unnamed_addr {
; CHECK-LABEL: @hoge(
; CHECK-NEXT: bb:
; CHECK-NEXT: br label [[BB1:%.*]]
@@ -108,9 +107,6 @@ bb1: ; preds = %bb1, %bb
br label %bb1
}
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-
define void @a(i1 %arg) {
; CHECK-LABEL: @a(
; CHECK-NEXT: b:
@@ -143,4 +139,3 @@ f: ; preds = %d
%e = getelementptr i8, ptr %i, i64 1
br label %d
}
-
diff --git a/llvm/test/Transforms/NewGVN/pr33305.ll b/llvm/test/Transforms/NewGVN/pr33305.ll
index e742f14..ff645f8 100644
--- a/llvm/test/Transforms/NewGVN/pr33305.ll
+++ b/llvm/test/Transforms/NewGVN/pr33305.ll
@@ -16,9 +16,9 @@ target triple = "x86_64-apple-macosx10.12.0"
@str.2 = private unnamed_addr constant [8 x i8] c"Screwed\00"
; Function Attrs: nounwind optsize ssp uwtable
-define i32 @main() local_unnamed_addr #0 {
+define i32 @main() local_unnamed_addr {
; CHECK-LABEL: define i32 @main(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DOTPR_I:%.*]] = load i32, ptr @c, align 4, !tbaa [[INT_TBAA3:![0-9]+]]
; CHECK-NEXT: [[CMP13_I:%.*]] = icmp slt i32 [[DOTPR_I]], 1
@@ -77,7 +77,7 @@ define i32 @main() local_unnamed_addr #0 {
; CHECK-NEXT: br i1 [[TOBOOL]], label %[[IF_END:.*]], label %[[IF_THEN:.*]]
; CHECK: [[IF_THEN]]:
; CHECK-NEXT: [[PUTS2:%.*]] = tail call i32 @puts(ptr @str.2)
-; CHECK-NEXT: tail call void @abort() #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: tail call void @abort()
; CHECK-NEXT: unreachable
; CHECK: [[IF_END]]:
; CHECK-NEXT: [[PUTS:%.*]] = tail call i32 @puts(ptr @str)
@@ -153,7 +153,7 @@ fn1.exit: ; preds = %if.then.i, %for.end
if.then: ; preds = %fn1.exit
%puts2 = tail call i32 @puts(ptr @str.2)
- tail call void @abort() #3
+ tail call void @abort()
unreachable
if.end: ; preds = %fn1.exit
@@ -162,15 +162,10 @@ if.end: ; preds = %fn1.exit
}
; Function Attrs: noreturn nounwind optsize
-declare void @abort() local_unnamed_addr #1
+declare void @abort() local_unnamed_addr
; Function Attrs: nounwind
-declare i32 @puts(ptr nocapture readonly) local_unnamed_addr #2
-
-attributes #0 = { nounwind optsize ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noreturn nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-attributes #3 = { noreturn nounwind optsize }
+declare i32 @puts(ptr nocapture readonly) local_unnamed_addr
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/NewGVN/pr34430.ll b/llvm/test/Transforms/NewGVN/pr34430.ll
index 490ba433..62d5770 100644
--- a/llvm/test/Transforms/NewGVN/pr34430.ll
+++ b/llvm/test/Transforms/NewGVN/pr34430.ll
@@ -4,7 +4,7 @@
source_filename = "bugpoint-output-e4c7d0f.bc"
; Make sure we still properly resolve phi cycles when they involve predicateinfo copies of phis.
-define void @hoge(i1 %arg) local_unnamed_addr #0 {
+define void @hoge(i1 %arg) local_unnamed_addr {
; CHECK-LABEL: @hoge(
; CHECK-NEXT: bb:
; CHECK-NEXT: br i1 %arg, label [[BB6:%.*]], label [[BB1:%.*]]
@@ -41,8 +41,6 @@ bb6: ; preds = %bb4, %bb2, %bb1, %b
br label %bb4
}
-attributes #0 = { norecurse noreturn nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 6.0.0"}
diff --git a/llvm/test/Transforms/NewGVN/pr34452.ll b/llvm/test/Transforms/NewGVN/pr34452.ll
index 48bdd88..7c38147 100644
--- a/llvm/test/Transforms/NewGVN/pr34452.ll
+++ b/llvm/test/Transforms/NewGVN/pr34452.ll
@@ -6,9 +6,9 @@ source_filename = "bugpoint-output-09f7a24.bc"
@WHOLELINE = external local_unnamed_addr global i32, align 4
; Function Attrs: nounwind uwtable
-define void @sgrep() local_unnamed_addr #0 {
+define void @sgrep() local_unnamed_addr {
; CHECK-LABEL: define void @sgrep(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @WHOLELINE, align 4, !tbaa [[INT_TBAA1:![0-9]+]]
; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP0]], 0
@@ -36,10 +36,7 @@ while.body.us: ; preds = %while.body.us, %ent
}
; Function Attrs: nounwind readnone speculatable
-declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) #1
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/OpenMP/dead_use.ll b/llvm/test/Transforms/OpenMP/dead_use.ll
index 1c4b2c6..ad0f91c 100644
--- a/llvm/test/Transforms/OpenMP/dead_use.ll
+++ b/llvm/test/Transforms/OpenMP/dead_use.ll
@@ -6,9 +6,9 @@
@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @.str }, align 8
; Function Attrs: nounwind uwtable
-define dso_local i32 @b() #0 {
+define dso_local i32 @b() {
; CHECK-LABEL: define dso_local i32 @b(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) {
; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @a()
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -21,9 +21,9 @@ define dso_local i32 @b() #0 {
}
; Function Attrs: nounwind uwtable
-define internal i32 @a() #0 {
+define internal i32 @a() {
; CHECK-LABEL: define internal i32 @a(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) {
; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @b()
; CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB0:[0-9]+]], i32 0, ptr @.omp_outlined.)
@@ -38,9 +38,9 @@ define internal i32 @a() #0 {
}
; Function Attrs: norecurse nounwind uwtable
-define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) #1 {
+define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) {
; CHECK-LABEL: define internal void @.omp_outlined.(
-; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) {
; CHECK-NEXT: [[TMP3:%.*]] = alloca ptr, align 8
; CHECK-NEXT: [[TMP4:%.*]] = alloca ptr, align 8
; CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8, !tbaa [[ANYPTR_TBAA2:![0-9]+]]
@@ -55,11 +55,7 @@ define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) #1 {
}
; Function Attrs: nounwind
-declare !callback !6 void @__kmpc_fork_call(ptr, i32, ptr, ...) #2
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !callback !6 void @__kmpc_fork_call(ptr, i32, ptr, ...)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/OpenMP/icv_remarks.ll b/llvm/test/Transforms/OpenMP/icv_remarks.ll
index f76d487..3caa56f 100644
--- a/llvm/test/Transforms/OpenMP/icv_remarks.ll
+++ b/llvm/test/Transforms/OpenMP/icv_remarks.ll
@@ -14,55 +14,48 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV nthreads Value: IMPLEMENTATION_DEFINED
; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV active_levels Value: 0
; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV cancel Value: 0
-define dso_local void @foo(i32 %a) local_unnamed_addr #0 !dbg !17 {
+define dso_local void @foo(i32 %a) local_unnamed_addr !dbg !17 {
entry:
%.kmpc_loc.addr = alloca %struct.ident_t, align 8
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 8 dereferenceable(24) %.kmpc_loc.addr, ptr nonnull align 8 dereferenceable(24) @0, i64 16, i1 false)
call void @llvm.dbg.value(metadata i32 %a, metadata !19, metadata !DIExpression()), !dbg !21
- tail call void @omp_set_num_threads(i32 %a) #1, !dbg !22
- %call = tail call i32 @omp_get_max_threads() #1, !dbg !23
+ tail call void @omp_set_num_threads(i32 %a), !dbg !22
+ %call = tail call i32 @omp_get_max_threads(), !dbg !23
call void @llvm.dbg.value(metadata i32 %call, metadata !20, metadata !DIExpression()), !dbg !21
- tail call void @use(i32 %call) #1, !dbg !24
+ tail call void @use(i32 %call), !dbg !24
%0 = getelementptr inbounds %struct.ident_t, ptr %.kmpc_loc.addr, i64 0, i32 4, !dbg !25
store ptr @1, ptr %0, align 8, !dbg !25, !tbaa !26
- call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr nonnull %.kmpc_loc.addr, i32 0, ptr @.omp_outlined.) #1, !dbg !25
+ call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr nonnull %.kmpc_loc.addr, i32 0, ptr @.omp_outlined.), !dbg !25
ret void, !dbg !32
}
-declare !dbg !4 dso_local void @omp_set_num_threads(i32) local_unnamed_addr #1
+declare !dbg !4 dso_local void @omp_set_num_threads(i32) local_unnamed_addr
-declare !dbg !9 dso_local i32 @omp_get_max_threads() local_unnamed_addr #1
+declare !dbg !9 dso_local i32 @omp_get_max_threads() local_unnamed_addr
-declare !dbg !12 dso_local void @use(i32) local_unnamed_addr #2
+declare !dbg !12 dso_local void @use(i32) local_unnamed_addr
; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV nthreads Value: IMPLEMENTATION_DEFINED
; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV active_levels Value: 0
; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV cancel Value: 0
-define internal void @.omp_outlined.(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid.) #3 !dbg !33 {
+define internal void @.omp_outlined.(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid.) !dbg !33 {
entry:
call void @llvm.dbg.value(metadata ptr %.global_tid., metadata !41, metadata !DIExpression()), !dbg !43
call void @llvm.dbg.value(metadata ptr %.bound_tid., metadata !42, metadata !DIExpression()), !dbg !43
- call void @llvm.dbg.value(metadata ptr undef, metadata !44, metadata !DIExpression()) #1, !dbg !50
- call void @llvm.dbg.value(metadata ptr undef, metadata !47, metadata !DIExpression()) #1, !dbg !50
- tail call void @omp_set_num_threads(i32 10) #1, !dbg !52
- %call.i = tail call i32 @omp_get_max_threads() #1, !dbg !53
- call void @llvm.dbg.value(metadata i32 %call.i, metadata !48, metadata !DIExpression()) #1, !dbg !54
- tail call void @use(i32 %call.i) #1, !dbg !55
+ call void @llvm.dbg.value(metadata ptr undef, metadata !44, metadata !DIExpression()), !dbg !50
+ call void @llvm.dbg.value(metadata ptr undef, metadata !47, metadata !DIExpression()), !dbg !50
+ tail call void @omp_set_num_threads(i32 10), !dbg !52
+ %call.i = tail call i32 @omp_get_max_threads(), !dbg !53
+ call void @llvm.dbg.value(metadata i32 %call.i, metadata !48, metadata !DIExpression()), !dbg !54
+ tail call void @use(i32 %call.i), !dbg !55
ret void, !dbg !56
}
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #4
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare !callback !57 dso_local void @__kmpc_fork_call(ptr, i32, ptr, ...) local_unnamed_addr #1
+declare !callback !57 dso_local void @__kmpc_fork_call(ptr, i32, ptr, ...) local_unnamed_addr
-declare void @llvm.dbg.value(metadata, metadata, metadata) #5
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { argmemonly nounwind willreturn }
-attributes #5 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15, !59}
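For readers unfamiliar with the construct these hunks keep deleting: LLVM IR can attach function attributes either inline on a definition/declaration or through a numbered attribute group referenced as #N and defined once at module scope. A minimal sketch of the two equivalent spellings, using a hypothetical function @f (the two lines of each form would not both appear in one module):

; attribute-group form: the declaration references group #0
declare void @f() #0
attributes #0 = { nounwind }

; inline form: the same attribute written directly on the declaration
declare void @f() nounwind

When a test's CHECK lines never exercise the attributes, the #N references and their attributes lines are dead weight, which is why they can be dropped wholesale here.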
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll
index 0a1efda..eb1cc5e 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll
@@ -9,7 +9,6 @@
; CHECK-NOT: remark: {{.*}}
; CHECK: !{!"branch_weights", i32 0, i32 200000}
-
; ModuleID = 'misexpect-branch-correct.c'
source_filename = "misexpect-branch-correct.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -19,14 +18,14 @@ target triple = "x86_64-unknown-linux-gnu"
@outer_loop = constant i32 2000, align 4
; Function Attrs: nounwind
-define i32 @bar() #0 {
+define i32 @bar() {
entry:
%rando = alloca i32, align 4
%x = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %rando) #4
+ call void @llvm.lifetime.start.p0(ptr %rando)
%call = call i32 (...) @buzz()
store i32 %call, ptr %rando, align 4, !tbaa !3
- call void @llvm.lifetime.start.p0(ptr %x) #4
+ call void @llvm.lifetime.start.p0(ptr %x)
store i32 0, ptr %x, align 4, !tbaa !3
%0 = load i32, ptr %rando, align 4, !tbaa !3
%rem = srem i32 %0, 200000
@@ -52,31 +51,25 @@ if.else: ; preds = %entry
if.end: ; preds = %if.else, %if.then
%2 = load i32, ptr %x, align 4, !tbaa !3
- call void @llvm.lifetime.end.p0(ptr %x) #4
- call void @llvm.lifetime.end.p0(ptr %rando) #4
+ call void @llvm.lifetime.end.p0(ptr %x)
+ call void @llvm.lifetime.end.p0(ptr %rando)
ret i32 %2
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
-declare i32 @buzz(...) #2
+declare i32 @buzz(...)
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #3
+declare i64 @llvm.expect.i64(i64, i64)
-declare i32 @baz(i32) #2
+declare i32 @baz(i32)
-declare i32 @foo(i32) #2
+declare i32 @foo(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
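The @llvm.expect.i64 intrinsic declared in these misexpect tests is the IR form of __builtin_expect: it returns its first operand unchanged while recording that the second operand is the expected value, which the misexpect diagnostics then compare against real profile counts. A minimal sketch, with hypothetical values, of the pattern a frontend typically emits:

%cmp = icmp eq i32 %x, 0
%conv = zext i1 %cmp to i64
%expval = call i64 @llvm.expect.i64(i64 %conv, i64 1) ; annotate: %cmp expected true
%tobool = icmp ne i64 %expval, 0
br i1 %tobool, label %likely, label %unlikely

The LowerExpectIntrinsic pass turns this into !prof branch weights; a misexpect remark fires when PGO data contradicts the annotated expectation.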
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll
index 68b233e..3390825 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll
@@ -16,14 +16,14 @@ target triple = "x86_64-unknown-linux-gnu"
@outer_loop = constant i32 2000, align 4
; Function Attrs: nounwind
-define i32 @bar() #0 !dbg !6 {
+define i32 @bar() !dbg !6 {
entry:
%rando = alloca i32, align 4
%x = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %rando) #4, !dbg !9
+ call void @llvm.lifetime.start.p0(ptr %rando), !dbg !9
%call = call i32 (...) @buzz(), !dbg !9
store i32 %call, ptr %rando, align 4, !dbg !9, !tbaa !10
- call void @llvm.lifetime.start.p0(ptr %x) #4, !dbg !14
+ call void @llvm.lifetime.start.p0(ptr %x), !dbg !14
store i32 0, ptr %x, align 4, !dbg !14, !tbaa !10
%0 = load i32, ptr %rando, align 4, !dbg !15, !tbaa !10
%rem = srem i32 %0, 200000, !dbg !15
@@ -49,31 +49,25 @@ if.else: ; preds = %entry
if.end: ; preds = %if.else, %if.then
%2 = load i32, ptr %x, align 4, !dbg !19, !tbaa !10
- call void @llvm.lifetime.end.p0(ptr %x) #4, !dbg !20
- call void @llvm.lifetime.end.p0(ptr %rando) #4, !dbg !20
+ call void @llvm.lifetime.end.p0(ptr %x), !dbg !20
+ call void @llvm.lifetime.end.p0(ptr %rando), !dbg !20
ret i32 %2, !dbg !19
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
-declare i32 @buzz(...) #2
+declare i32 @buzz(...)
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #3
+declare i64 @llvm.expect.i64(i64, i64)
-declare i32 @baz(i32) #2
+declare i32 @baz(i32)
-declare i32 @foo(i32) #2
+declare i32 @foo(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll
index 2f188f5..d34708c 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll
@@ -19,7 +19,6 @@
; DISABLED-NOT: warning: <unknown>:0:0: 19.98%
; DISABLED-NOT: remark: <unknown>:0:0: Potential performance regression from use of the llvm.expect intrinsic: Annotation was correct on 19.98% (399668 / 2000000) of profiled executions.
-
; ModuleID = 'misexpect-branch.c'
source_filename = "misexpect-branch.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -29,14 +28,14 @@ target triple = "x86_64-unknown-linux-gnu"
@outer_loop = constant i32 2000, align 4
; Function Attrs: nounwind
-define i32 @bar() #0 {
+define i32 @bar() {
entry:
%rando = alloca i32, align 4
%x = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %rando) #4
+ call void @llvm.lifetime.start.p0(ptr %rando)
%call = call i32 (...) @buzz()
store i32 %call, ptr %rando, align 4, !tbaa !3
- call void @llvm.lifetime.start.p0(ptr %x) #4
+ call void @llvm.lifetime.start.p0(ptr %x)
store i32 0, ptr %x, align 4, !tbaa !3
%0 = load i32, ptr %rando, align 4, !tbaa !3
%rem = srem i32 %0, 200000
@@ -62,31 +61,25 @@ if.else: ; preds = %entry
if.end: ; preds = %if.else, %if.then
%2 = load i32, ptr %x, align 4, !tbaa !3
- call void @llvm.lifetime.end.p0(ptr %x) #4
- call void @llvm.lifetime.end.p0(ptr %rando) #4
+ call void @llvm.lifetime.end.p0(ptr %x)
+ call void @llvm.lifetime.end.p0(ptr %rando)
ret i32 %2
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
-declare i32 @buzz(...) #2
+declare i32 @buzz(...)
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #3
+declare i64 @llvm.expect.i64(i64, i64)
-declare i32 @baz(i32) #2
+declare i32 @baz(i32)
-declare i32 @foo(i32) #2
+declare i32 @foo(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll
index 4add781..a43e632 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll
@@ -7,7 +7,6 @@
; CHECK-NOT: warning: {{.*}}
; CHECK-NOT: remark: {{.*}}
-
; ModuleID = 'misexpect-branch-unpredictable.c'
source_filename = "clang/test/Profile/misexpect-branch-unpredictable.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -17,14 +16,14 @@ target triple = "x86_64-unknown-linux-gnu"
@outer_loop = constant i32 2000, align 4
; Function Attrs: nounwind
-define i32 @bar() #0 {
+define i32 @bar() {
entry:
%rando = alloca i32, align 4
%x = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %rando) #3
+ call void @llvm.lifetime.start.p0(ptr %rando)
%call = call i32 (...) @buzz()
store i32 %call, ptr %rando, align 4, !tbaa !2
- call void @llvm.lifetime.start.p0(ptr %x) #3
+ call void @llvm.lifetime.start.p0(ptr %x)
store i32 0, ptr %x, align 4, !tbaa !2
%0 = load i32, ptr %rando, align 4, !tbaa !2
%rem = srem i32 %0, 200000
@@ -49,27 +48,22 @@ if.else: ; preds = %entry
if.end: ; preds = %if.else, %if.then
%2 = load i32, ptr %x, align 4, !tbaa !2
- call void @llvm.lifetime.end.p0(ptr %x) #3
- call void @llvm.lifetime.end.p0(ptr %rando) #3
+ call void @llvm.lifetime.end.p0(ptr %x)
+ call void @llvm.lifetime.end.p0(ptr %rando)
ret i32 %2
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
-declare i32 @buzz(...) #2
+declare i32 @buzz(...)
-declare i32 @baz(i32) #2
+declare i32 @baz(i32)
-declare i32 @foo(i32) #2
+declare i32 @foo(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch.ll
index 5a7731b..7e01f7a 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-branch.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-branch.ll
@@ -33,14 +33,14 @@ target triple = "x86_64-unknown-linux-gnu"
@outer_loop = constant i32 2000, align 4
; Function Attrs: nounwind
-define i32 @bar() #0 !dbg !6 {
+define i32 @bar() !dbg !6 {
entry:
%rando = alloca i32, align 4
%x = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %rando) #4, !dbg !9
+ call void @llvm.lifetime.start.p0(ptr %rando), !dbg !9
%call = call i32 (...) @buzz(), !dbg !9
store i32 %call, ptr %rando, align 4, !dbg !9, !tbaa !10
- call void @llvm.lifetime.start.p0(ptr %x) #4, !dbg !14
+ call void @llvm.lifetime.start.p0(ptr %x), !dbg !14
store i32 0, ptr %x, align 4, !dbg !14, !tbaa !10
%0 = load i32, ptr %rando, align 4, !dbg !15, !tbaa !10
%rem = srem i32 %0, 200000, !dbg !15
@@ -66,31 +66,25 @@ if.else: ; preds = %entry
if.end: ; preds = %if.else, %if.then
%2 = load i32, ptr %x, align 4, !dbg !19, !tbaa !10
- call void @llvm.lifetime.end.p0(ptr %x) #4, !dbg !20
- call void @llvm.lifetime.end.p0(ptr %rando) #4, !dbg !20
+ call void @llvm.lifetime.end.p0(ptr %x), !dbg !20
+ call void @llvm.lifetime.end.p0(ptr %rando), !dbg !20
ret i32 %2, !dbg !19
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
-declare i32 @buzz(...) #2
+declare i32 @buzz(...)
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #3
+declare i64 @llvm.expect.i64(i64, i64)
-declare i32 @baz(i32) #2
+declare i32 @baz(i32)
-declare i32 @foo(i32) #2
+declare i32 @foo(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll b/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll
index 859ba72..38c27b9 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll
@@ -25,7 +25,6 @@
; CORRECT-NOT: warning: {{.*}}
; CORRECT-NOT: remark: {{.*}}
-
; ModuleID = 'misexpect-switch.c'
source_filename = "misexpect-switch.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -37,10 +36,10 @@ target triple = "x86_64-unknown-linux-gnu"
@arry = dso_local global [25 x i32] zeroinitializer, align 16
; Function Attrs: nounwind uwtable
-define dso_local void @init_arry() #0 {
+define dso_local void @init_arry() {
entry:
%i = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %i) #6
+ call void @llvm.lifetime.start.p0(ptr %i)
store i32 0, ptr %i, align 4, !tbaa !4
br label %for.cond
@@ -50,7 +49,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
- %call = call i32 @rand() #6
+ %call = call i32 @rand()
%rem = srem i32 %call, 10
%1 = load i32, ptr %i, align 4, !tbaa !4
%idxprom = sext i32 %1 to i64
@@ -65,24 +64,24 @@ for.inc: ; preds = %for.body
br label %for.cond
for.end: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %i) #6
+ call void @llvm.lifetime.end.p0(ptr %i)
ret void
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind
-declare dso_local i32 @rand() #3
+declare dso_local i32 @rand()
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
; Function Attrs: nounwind uwtable
-define dso_local i32 @main() #0 {
+define dso_local i32 @main() {
entry:
%retval = alloca i32, align 4
%val = alloca i32, align 4
@@ -90,9 +89,9 @@ entry:
%condition = alloca i32, align 4
store i32 0, ptr %retval, align 4
call void @init_arry()
- call void @llvm.lifetime.start.p0(ptr %val) #6
+ call void @llvm.lifetime.start.p0(ptr %val)
store i32 0, ptr %val, align 4, !tbaa !4
- call void @llvm.lifetime.start.p0(ptr %j) #6
+ call void @llvm.lifetime.start.p0(ptr %j)
store i32 0, ptr %j, align 4, !tbaa !4
br label %for.cond
@@ -102,8 +101,8 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
- call void @llvm.lifetime.start.p0(ptr %condition) #6
- %call = call i32 @rand() #6
+ call void @llvm.lifetime.start.p0(ptr %condition)
+ %call = call i32 @rand()
%rem = srem i32 %call, 5
store i32 %rem, ptr %condition, align 4, !tbaa !4
%1 = load i32, ptr %condition, align 4, !tbaa !4
@@ -138,7 +137,7 @@ sw.default: ; preds = %for.body
unreachable
sw.epilog: ; preds = %sw.bb3, %sw.bb2, %sw.bb
- call void @llvm.lifetime.end.p0(ptr %condition) #6
+ call void @llvm.lifetime.end.p0(ptr %condition)
br label %for.inc
for.inc: ; preds = %sw.epilog
@@ -148,25 +147,17 @@ for.inc: ; preds = %sw.epilog
br label %for.cond
for.end: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %j) #6
- call void @llvm.lifetime.end.p0(ptr %val) #6
+ call void @llvm.lifetime.end.p0(ptr %j)
+ call void @llvm.lifetime.end.p0(ptr %val)
ret i32 0
}
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #4
-
-declare dso_local i32 @sum(ptr, i32) #5
+declare i64 @llvm.expect.i64(i64, i64)
-declare dso_local i32 @random_sample(ptr, i32) #5
+declare dso_local i32 @sum(ptr, i32)
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone willreturn }
-attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #6 = { nounwind }
+declare dso_local i32 @random_sample(ptr, i32)
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-switch.ll b/llvm/test/Transforms/PGOProfile/misexpect-switch.ll
index 242d5b8..9665559 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-switch.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-switch.ll
@@ -28,7 +28,6 @@
; CORRECT-NOT: warning: {{.*}}
; CORRECT-NOT: remark: {{.*}}
-
; ModuleID = 'misexpect-switch.c'
source_filename = "misexpect-switch.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -40,10 +39,10 @@ target triple = "x86_64-unknown-linux-gnu"
@arry = dso_local global [25 x i32] zeroinitializer, align 16, !dbg !12
; Function Attrs: nounwind uwtable
-define dso_local void @init_arry() #0 !dbg !21 {
+define dso_local void @init_arry() !dbg !21 {
entry:
%i = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %i) #6, !dbg !26
+ call void @llvm.lifetime.start.p0(ptr %i), !dbg !26
call void @llvm.dbg.declare(metadata ptr %i, metadata !25, metadata !DIExpression()), !dbg !27
store i32 0, ptr %i, align 4, !dbg !28, !tbaa !30
br label %for.cond, !dbg !34
@@ -54,7 +53,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.end, !dbg !38
for.body: ; preds = %for.cond
- %call = call i32 @rand() #6, !dbg !39
+ %call = call i32 @rand(), !dbg !39
%rem = srem i32 %call, 10, !dbg !41
%1 = load i32, ptr %i, align 4, !dbg !42, !tbaa !30
%idxprom = sext i32 %1 to i64, !dbg !43
@@ -69,24 +68,24 @@ for.inc: ; preds = %for.body
br label %for.cond, !dbg !47, !llvm.loop !48
for.end: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %i) #6, !dbg !50
+ call void @llvm.lifetime.end.p0(ptr %i), !dbg !50
ret void, !dbg !50
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind
-declare dso_local i32 @rand() #3
+declare dso_local i32 @rand()
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
; Function Attrs: nounwind uwtable
-define dso_local i32 @main() #0 !dbg !51 {
+define dso_local i32 @main() !dbg !51 {
entry:
%retval = alloca i32, align 4
%val = alloca i32, align 4
@@ -94,10 +93,10 @@ entry:
%condition = alloca i32, align 4
store i32 0, ptr %retval, align 4
call void @init_arry(), !dbg !62
- call void @llvm.lifetime.start.p0(ptr %val) #6, !dbg !63
+ call void @llvm.lifetime.start.p0(ptr %val), !dbg !63
call void @llvm.dbg.declare(metadata ptr %val, metadata !55, metadata !DIExpression()), !dbg !64
store i32 0, ptr %val, align 4, !dbg !64, !tbaa !30
- call void @llvm.lifetime.start.p0(ptr %j) #6, !dbg !65
+ call void @llvm.lifetime.start.p0(ptr %j), !dbg !65
call void @llvm.dbg.declare(metadata ptr %j, metadata !56, metadata !DIExpression()), !dbg !66
store i32 0, ptr %j, align 4, !dbg !67, !tbaa !30
br label %for.cond, !dbg !68
@@ -108,9 +107,9 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.end, !dbg !71
for.body: ; preds = %for.cond
- call void @llvm.lifetime.start.p0(ptr %condition) #6, !dbg !72
+ call void @llvm.lifetime.start.p0(ptr %condition), !dbg !72
call void @llvm.dbg.declare(metadata ptr %condition, metadata !57, metadata !DIExpression()), !dbg !73
- %call = call i32 @rand() #6, !dbg !74
+ %call = call i32 @rand(), !dbg !74
%rem = srem i32 %call, 5, !dbg !75
store i32 %rem, ptr %condition, align 4, !dbg !73, !tbaa !30
%1 = load i32, ptr %condition, align 4, !dbg !76, !tbaa !30
@@ -145,7 +144,7 @@ sw.default: ; preds = %for.body
unreachable, !dbg !87
sw.epilog: ; preds = %sw.bb3, %sw.bb2, %sw.bb
- call void @llvm.lifetime.end.p0(ptr %condition) #6, !dbg !88
+ call void @llvm.lifetime.end.p0(ptr %condition), !dbg !88
br label %for.inc, !dbg !89
for.inc: ; preds = %sw.epilog
@@ -155,25 +154,17 @@ for.inc: ; preds = %sw.epilog
br label %for.cond, !dbg !91, !llvm.loop !92
for.end: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %j) #6, !dbg !94
- call void @llvm.lifetime.end.p0(ptr %val) #6, !dbg !94
+ call void @llvm.lifetime.end.p0(ptr %j), !dbg !94
+ call void @llvm.lifetime.end.p0(ptr %val), !dbg !94
ret i32 0, !dbg !95
}
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #4
-
-declare dso_local i32 @sum(ptr, i32) #5
+declare i64 @llvm.expect.i64(i64, i64)
-declare dso_local i32 @random_sample(ptr, i32) #5
+declare dso_local i32 @sum(ptr, i32)
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone willreturn }
-attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #6 = { nounwind }
+declare dso_local i32 @random_sample(ptr, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!17, !18, !19}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll
index 27e8fd0..3b61750 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll
@@ -6,7 +6,7 @@ target triple = "aarch64"
; This function (a 16x reduction of a[i] * b[i]) should be vectorized successfully.
-define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1) #0 {
+define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1) {
; CHECK-LABEL: define dso_local nofpclass(nan inf) float @vmlaq
; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]]
@@ -21,9 +21,9 @@ define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1)
%6 = alloca i32, align 4
store ptr %0, ptr %3, align 8, !tbaa !4
store ptr %1, ptr %4, align 8, !tbaa !4
- call void @llvm.lifetime.start.p0(ptr %5) #2
+ call void @llvm.lifetime.start.p0(ptr %5)
store float 0.000000e+00, ptr %5, align 4, !tbaa !9
- call void @llvm.lifetime.start.p0(ptr %6) #2
+ call void @llvm.lifetime.start.p0(ptr %6)
store i32 0, ptr %6, align 4, !tbaa !11
br label %7
@@ -33,7 +33,7 @@ define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1)
br i1 %9, label %11, label %10
10: ; preds = %7
- call void @llvm.lifetime.end.p0(ptr %6) #2
+ call void @llvm.lifetime.end.p0(ptr %6)
br label %28
11: ; preds = %7
@@ -61,16 +61,12 @@ define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1)
28: ; preds = %10
%29 = load float, ptr %5, align 4, !tbaa !9
- call void @llvm.lifetime.end.p0(ptr %5) #2
+ call void @llvm.lifetime.end.p0(ptr %5)
ret float %29
}
-declare void @llvm.lifetime.start.p0(ptr captures(none)) #1
-declare void @llvm.lifetime.end.p0(ptr captures(none)) #1
-
-attributes #0 = { nounwind uwtable "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-attributes #2 = { nounwind }
+declare void @llvm.lifetime.start.p0(ptr captures(none))
+declare void @llvm.lifetime.end.p0(ptr captures(none))
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
index 68bfbc1..eefde9d 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
@@ -6,7 +6,7 @@ target triple = "aarch64"
; This function (a more complex reduction of (a[i] - b[i]) * itself) should be vectorized successfully.
-define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) #0 {
+define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) {
; CHECK-LABEL: define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii
; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]], i32 noundef [[TMP2:%.*]], i32 noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: .preheader.i:
@@ -125,7 +125,7 @@ define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii(ptr noundef %
ret float %13
}
-define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) #1 {
+define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) {
%5 = alloca ptr, align 8
%6 = alloca ptr, align 8
%7 = alloca i32, align 4
@@ -143,15 +143,15 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
store ptr %1, ptr %6, align 8, !tbaa !4
store i32 %2, ptr %7, align 4, !tbaa !9
store i32 %3, ptr %8, align 4, !tbaa !9
- call void @llvm.lifetime.start.p0(ptr %9) #3
+ call void @llvm.lifetime.start.p0(ptr %9)
store i32 3, ptr %9, align 4, !tbaa !9
- call void @llvm.lifetime.start.p0(ptr %10) #3
+ call void @llvm.lifetime.start.p0(ptr %10)
store i32 3, ptr %10, align 4, !tbaa !9
- call void @llvm.lifetime.start.p0(ptr %11) #3
+ call void @llvm.lifetime.start.p0(ptr %11)
store i32 7, ptr %11, align 4, !tbaa !9
- call void @llvm.lifetime.start.p0(ptr %12) #3
+ call void @llvm.lifetime.start.p0(ptr %12)
store float 0.000000e+00, ptr %12, align 4, !tbaa !11
- call void @llvm.lifetime.start.p0(ptr %13) #3
+ call void @llvm.lifetime.start.p0(ptr %13)
store i32 0, ptr %13, align 4, !tbaa !9
br label %18
@@ -162,13 +162,13 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
21: ; preds = %18
store i32 2, ptr %14, align 4
- call void @llvm.lifetime.end.p0(ptr %13) #3
+ call void @llvm.lifetime.end.p0(ptr %13)
br label %62
22: ; preds = %18
- call void @llvm.lifetime.start.p0(ptr %15) #3
+ call void @llvm.lifetime.start.p0(ptr %15)
store float 0.000000e+00, ptr %15, align 4, !tbaa !11
- call void @llvm.lifetime.start.p0(ptr %16) #3
+ call void @llvm.lifetime.start.p0(ptr %16)
store i32 0, ptr %16, align 4, !tbaa !9
br label %23
@@ -179,11 +179,11 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
26: ; preds = %23
store i32 5, ptr %14, align 4
- call void @llvm.lifetime.end.p0(ptr %16) #3
+ call void @llvm.lifetime.end.p0(ptr %16)
br label %47
27: ; preds = %23
- call void @llvm.lifetime.start.p0(ptr %17) #3
+ call void @llvm.lifetime.start.p0(ptr %17)
%28 = load ptr, ptr %5, align 8, !tbaa !4
%29 = load i32, ptr %16, align 4, !tbaa !9
%30 = sext i32 %29 to i64
@@ -202,7 +202,7 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
%42 = load float, ptr %15, align 4, !tbaa !11
%43 = fadd fast float %42, %41
store float %43, ptr %15, align 4, !tbaa !11
- call void @llvm.lifetime.end.p0(ptr %17) #3
+ call void @llvm.lifetime.end.p0(ptr %17)
br label %44
44: ; preds = %27
@@ -226,7 +226,7 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
%57 = load float, ptr %12, align 4, !tbaa !11
%58 = fadd fast float %57, %56
store float %58, ptr %12, align 4, !tbaa !11
- call void @llvm.lifetime.end.p0(ptr %15) #3
+ call void @llvm.lifetime.end.p0(ptr %15)
br label %59
59: ; preds = %47
@@ -238,20 +238,15 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
62: ; preds = %21
%63 = load float, ptr %12, align 4, !tbaa !11
store i32 1, ptr %14, align 4
- call void @llvm.lifetime.end.p0(ptr %12) #3
- call void @llvm.lifetime.end.p0(ptr %11) #3
- call void @llvm.lifetime.end.p0(ptr %10) #3
- call void @llvm.lifetime.end.p0(ptr %9) #3
+ call void @llvm.lifetime.end.p0(ptr %12)
+ call void @llvm.lifetime.end.p0(ptr %11)
+ call void @llvm.lifetime.end.p0(ptr %10)
+ call void @llvm.lifetime.end.p0(ptr %9)
ret float %63
}
-declare void @llvm.lifetime.start.p0(ptr captures(none)) #2
-declare void @llvm.lifetime.end.p0(ptr captures(none)) #2
-
-attributes #0 = { mustprogress uwtable "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" }
-attributes #1 = { inlinehint mustprogress nounwind uwtable "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" }
-attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.start.p0(ptr captures(none))
+declare void @llvm.lifetime.end.p0(ptr captures(none))
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
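The scalar loops in the two reductions above accumulate with fadd fast, and that reassociation freedom is what lets the phase-ordering pipeline rewrite them as horizontal reductions. A minimal sketch, on a hypothetical <4 x float> %v, of the reduced form the vectorizer targets:

%sum = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %v)

With the fast (or reassoc) flag the start value and lanes may be combined in any order; without it the intrinsic is a strictly ordered left-to-right reduction, which generally blocks this transformation.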
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
index 92e625d..82ecc3a 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
@@ -10,7 +10,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64"
; Function Attrs: nounwind uwtable
-define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 noundef %ip2) #0 {
+define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 noundef %ip2) {
; CHECK-LABEL: define range(i32 0, 65536) i32 @slpordering(
; CHECK-SAME: ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[IP1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[IP2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
@@ -136,14 +136,14 @@ entry:
store i32 %ip1, ptr %ip1.addr, align 4, !tbaa !8
store ptr %p2, ptr %p2.addr, align 8, !tbaa !4
store i32 %ip2, ptr %ip2.addr, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %emp) #2
- call void @llvm.lifetime.start.p0(ptr %r0) #2
- call void @llvm.lifetime.start.p0(ptr %r1) #2
- call void @llvm.lifetime.start.p0(ptr %r2) #2
- call void @llvm.lifetime.start.p0(ptr %r3) #2
- call void @llvm.lifetime.start.p0(ptr %sum) #2
+ call void @llvm.lifetime.start.p0(ptr %emp)
+ call void @llvm.lifetime.start.p0(ptr %r0)
+ call void @llvm.lifetime.start.p0(ptr %r1)
+ call void @llvm.lifetime.start.p0(ptr %r2)
+ call void @llvm.lifetime.start.p0(ptr %r3)
+ call void @llvm.lifetime.start.p0(ptr %sum)
store i32 0, ptr %sum, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %i) #2
+ call void @llvm.lifetime.start.p0(ptr %i)
store i32 0, ptr %i, align 4, !tbaa !8
br label %for.cond
@@ -153,7 +153,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %i) #2
+ call void @llvm.lifetime.end.p0(ptr %i)
br label %for.end
for.body: ; preds = %for.cond
@@ -241,22 +241,22 @@ for.body: ; preds = %for.cond
%shl42 = shl i32 %sub41, 16
%rdd43 = add nsw i32 %sub36, %shl42
store i32 %rdd43, ptr %r3, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e0) #2
+ call void @llvm.lifetime.start.p0(ptr %e0)
%33 = load i32, ptr %r0, align 4, !tbaa !8
%34 = load i32, ptr %r1, align 4, !tbaa !8
%rdd44 = add i32 %33, %34
store i32 %rdd44, ptr %e0, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e1) #2
+ call void @llvm.lifetime.start.p0(ptr %e1)
%35 = load i32, ptr %r0, align 4, !tbaa !8
%36 = load i32, ptr %r1, align 4, !tbaa !8
%sub45 = sub i32 %35, %36
store i32 %sub45, ptr %e1, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e2) #2
+ call void @llvm.lifetime.start.p0(ptr %e2)
%37 = load i32, ptr %r2, align 4, !tbaa !8
%38 = load i32, ptr %r3, align 4, !tbaa !8
%rdd46 = add i32 %37, %38
store i32 %rdd46, ptr %e2, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e3) #2
+ call void @llvm.lifetime.start.p0(ptr %e3)
%39 = load i32, ptr %r2, align 4, !tbaa !8
%40 = load i32, ptr %r3, align 4, !tbaa !8
%sub47 = sub i32 %39, %40
@@ -293,10 +293,10 @@ for.body: ; preds = %for.cond
%rrrayidx61 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom60
%rrrayidx62 = getelementptr inbounds [4 x i32], ptr %rrrayidx61, i64 0, i64 3
store i32 %sub59, ptr %rrrayidx62, align 4, !tbaa !8
- call void @llvm.lifetime.end.p0(ptr %e3) #2
- call void @llvm.lifetime.end.p0(ptr %e2) #2
- call void @llvm.lifetime.end.p0(ptr %e1) #2
- call void @llvm.lifetime.end.p0(ptr %e0) #2
+ call void @llvm.lifetime.end.p0(ptr %e3)
+ call void @llvm.lifetime.end.p0(ptr %e2)
+ call void @llvm.lifetime.end.p0(ptr %e1)
+ call void @llvm.lifetime.end.p0(ptr %e0)
br label %for.inc
for.inc: ; preds = %for.body
@@ -316,7 +316,7 @@ for.inc: ; preds = %for.body
br label %for.cond, !llvm.loop !11
for.end: ; preds = %for.cond.cleanup
- call void @llvm.lifetime.start.p0(ptr %i65) #2
+ call void @llvm.lifetime.start.p0(ptr %i65)
store i32 0, ptr %i65, align 4, !tbaa !8
br label %for.cond66
@@ -326,11 +326,11 @@ for.cond66: ; preds = %for.inc114, %for.en
br i1 %cmp67, label %for.body70, label %for.cond.cleanup69
for.cond.cleanup69: ; preds = %for.cond66
- call void @llvm.lifetime.end.p0(ptr %i65) #2
+ call void @llvm.lifetime.end.p0(ptr %i65)
br label %for.end116
for.body70: ; preds = %for.cond66
- call void @llvm.lifetime.start.p0(ptr %e071) #2
+ call void @llvm.lifetime.start.p0(ptr %e071)
%rrrayidx72 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0
%59 = load i32, ptr %i65, align 4, !tbaa !8
%idxprom73 = sext i32 %59 to i64
@@ -343,7 +343,7 @@ for.body70: ; preds = %for.cond66
%62 = load i32, ptr %rrrayidx77, align 4, !tbaa !8
%rdd78 = add i32 %60, %62
store i32 %rdd78, ptr %e071, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e179) #2
+ call void @llvm.lifetime.start.p0(ptr %e179)
%rrrayidx80 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0
%63 = load i32, ptr %i65, align 4, !tbaa !8
%idxprom81 = sext i32 %63 to i64
@@ -356,7 +356,7 @@ for.body70: ; preds = %for.cond66
%66 = load i32, ptr %rrrayidx85, align 4, !tbaa !8
%sub86 = sub i32 %64, %66
store i32 %sub86, ptr %e179, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e287) #2
+ call void @llvm.lifetime.start.p0(ptr %e287)
%rrrayidx88 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2
%67 = load i32, ptr %i65, align 4, !tbaa !8
%idxprom89 = sext i32 %67 to i64
@@ -369,7 +369,7 @@ for.body70: ; preds = %for.cond66
%70 = load i32, ptr %rrrayidx93, align 4, !tbaa !8
%rdd94 = add i32 %68, %70
store i32 %rdd94, ptr %e287, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e395) #2
+ call void @llvm.lifetime.start.p0(ptr %e395)
%rrrayidx96 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2
%71 = load i32, ptr %i65, align 4, !tbaa !8
%idxprom97 = sext i32 %71 to i64
@@ -398,10 +398,10 @@ for.body70: ; preds = %for.cond66
%82 = load i32, ptr %e395, align 4, !tbaa !8
%sub106 = sub nsw i32 %81, %82
store i32 %sub106, ptr %r3, align 4, !tbaa !8
- call void @llvm.lifetime.end.p0(ptr %e395) #2
- call void @llvm.lifetime.end.p0(ptr %e287) #2
- call void @llvm.lifetime.end.p0(ptr %e179) #2
- call void @llvm.lifetime.end.p0(ptr %e071) #2
+ call void @llvm.lifetime.end.p0(ptr %e395)
+ call void @llvm.lifetime.end.p0(ptr %e287)
+ call void @llvm.lifetime.end.p0(ptr %e179)
+ call void @llvm.lifetime.end.p0(ptr %e071)
%83 = load i32, ptr %r0, align 4, !tbaa !8
%call = call i32 @twoabs(i32 noundef %83)
%84 = load i32, ptr %r1, align 4, !tbaa !8
@@ -432,28 +432,28 @@ for.end116: ; preds = %for.cond.cleanup69
%shr = lshr i32 %90, 16
%rdd119 = add i32 %conv118, %shr
%shr120 = lshr i32 %rdd119, 1
- call void @llvm.lifetime.end.p0(ptr %sum) #2
- call void @llvm.lifetime.end.p0(ptr %r3) #2
- call void @llvm.lifetime.end.p0(ptr %r2) #2
- call void @llvm.lifetime.end.p0(ptr %r1) #2
- call void @llvm.lifetime.end.p0(ptr %r0) #2
- call void @llvm.lifetime.end.p0(ptr %emp) #2
+ call void @llvm.lifetime.end.p0(ptr %sum)
+ call void @llvm.lifetime.end.p0(ptr %r3)
+ call void @llvm.lifetime.end.p0(ptr %r2)
+ call void @llvm.lifetime.end.p0(ptr %r1)
+ call void @llvm.lifetime.end.p0(ptr %r0)
+ call void @llvm.lifetime.end.p0(ptr %emp)
ret i32 %shr120
}
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
; Function Attrs: nounwind uwtable
-define internal i32 @twoabs(i32 noundef %r) #0 {
+define internal i32 @twoabs(i32 noundef %r) {
entry:
%r.addr = alloca i32, align 4
%s = alloca i32, align 4
store i32 %r, ptr %r.addr, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %s) #2
+ call void @llvm.lifetime.start.p0(ptr %s)
%0 = load i32, ptr %r.addr, align 4, !tbaa !8
%shr = lshr i32 %0, 15
%rnd = and i32 %shr, 65537
@@ -464,14 +464,10 @@ entry:
%rdd = add i32 %1, %2
%3 = load i32, ptr %s, align 4, !tbaa !8
%xor = xor i32 %rdd, %3
- call void @llvm.lifetime.end.p0(ptr %s) #2
+ call void @llvm.lifetime.end.p0(ptr %s)
ret i32 %xor
}
-attributes #0 = { nounwind uwtable "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-attributes #2 = { nounwind }
-
!4 = !{!5, !5, i64 0}
!5 = !{!"any pointer", !6, i64 0}
!6 = !{!"omnipotent char", !7, i64 0}
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
index 7fb72e6..811957c 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
@@ -170,5 +170,5 @@ unreachable: ; preds = %cleanup
declare void @llvm.lifetime.start.p0(ptr nocapture)
declare void @llvm.lifetime.end.p0(ptr nocapture)
-attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
-attributes #1 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
+attributes #1 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
index 436f848a..6735577 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
@@ -9,7 +9,7 @@ target triple = "thumbv6m-none-none-eabi"
; should be deleted, too.
; Function Attrs: nounwind
-define dso_local void @arm_fill_q7(i8 signext %value, ptr %pDst, i32 %blockSize) #0 {
+define dso_local void @arm_fill_q7(i8 signext %value, ptr %pDst, i32 %blockSize) {
; OLDPM-LABEL: @arm_fill_q7(
; OLDPM-NEXT: entry:
; OLDPM-NEXT: [[CMP_NOT20:%.*]] = icmp ult i32 [[BLOCKSIZE:%.*]], 4
@@ -59,8 +59,8 @@ entry:
store i8 %value, ptr %value.addr, align 1, !tbaa !3
store ptr %pDst, ptr %pDst.addr, align 4, !tbaa !6
store i32 %blockSize, ptr %blockSize.addr, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %blkCnt) #3
- call void @llvm.lifetime.start.p0(ptr %packedValue) #3
+ call void @llvm.lifetime.start.p0(ptr %blkCnt)
+ call void @llvm.lifetime.start.p0(ptr %packedValue)
%0 = load i8, ptr %value.addr, align 1, !tbaa !3
%conv = sext i8 %0 to i32
%shl = shl i32 %conv, 0
@@ -122,23 +122,23 @@ while.body16: ; preds = %while.cond13
br label %while.cond13, !llvm.loop !12
while.end18: ; preds = %while.cond13
- call void @llvm.lifetime.end.p0(ptr %packedValue) #3
- call void @llvm.lifetime.end.p0(ptr %blkCnt) #3
+ call void @llvm.lifetime.end.p0(ptr %packedValue)
+ call void @llvm.lifetime.end.p0(ptr %blkCnt)
ret void
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: alwaysinline nounwind
-define internal void @write_q7x4_ia(ptr %pQ7, i32 %value) #2 {
+define internal void @write_q7x4_ia(ptr %pQ7, i32 %value) {
entry:
%pQ7.addr = alloca ptr, align 4
%value.addr = alloca i32, align 4
%val = alloca i32, align 4
store ptr %pQ7, ptr %pQ7.addr, align 4, !tbaa !6
store i32 %value, ptr %value.addr, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %val) #3
+ call void @llvm.lifetime.start.p0(ptr %val)
%0 = load i32, ptr %value.addr, align 4, !tbaa !8
store i32 %0, ptr %val, align 4, !tbaa !8
%1 = load i32, ptr %val, align 4, !tbaa !8
@@ -175,17 +175,12 @@ entry:
%14 = load ptr, ptr %13, align 4, !tbaa !6
%add.ptr = getelementptr inbounds i8, ptr %14, i32 4
store ptr %add.ptr, ptr %13, align 4, !tbaa !6
- call void @llvm.lifetime.end.p0(ptr %val) #3
+ call void @llvm.lifetime.end.p0(ptr %val)
ret void
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m0plus" "target-features"="+armv6-m,+strict-align,+thumb-mode,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-dotprod,-dsp,-fp16fml,-fullfp16,-hwdiv,-hwdiv-arm,-i8mm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nofree nosync nounwind willreturn }
-attributes #2 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m0plus" "target-features"="+armv6-m,+strict-align,+thumb-mode,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-dotprod,-dsp,-fp16fml,-fullfp16,-hwdiv,-hwdiv-arm,-i8mm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
index c186207..4274719 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
@@ -133,7 +133,7 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #1
declare i32 @llvm.arm.mve.addv.v16i8(<16 x i8>, i32) #2
declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-pacbti,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-pacbti,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
attributes #1 = { argmemonly nocallback nofree nosync nounwind willreturn }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index 42fdafb..5127b7d 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -216,7 +216,7 @@ unreachable: ; preds = %cleanup
declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
attributes #1 = { argmemonly nofree nosync nounwind willreturn }
-attributes #2 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
+attributes #2 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
attributes #3 = { nounwind }
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
index 8e25c9c..c5f56d3 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
@@ -15,7 +15,7 @@
; Ideally, this should reach the backend with 1 fmul, 1 fsub, 1 fadd, and 1 shuffle.
; That may require some coordination between VectorCombine, SLP, and other passes.
-define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
+define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) {
; CHECK-LABEL: @buildvector_mul_addsub_ps128(
; CHECK-NEXT: [[A:%.*]] = fmul <4 x float> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fsub <4 x float> [[A]], [[B:%.*]]
@@ -43,7 +43,7 @@ define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D,
ret <4 x float> %vecinsert4
}
-define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
+define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) {
; CHECK-LABEL: @buildvector_mul_addsub_pd128(
; CHECK-NEXT: [[A:%.*]] = fmul <2 x double> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fsub <2 x double> [[A]], [[B:%.*]]
@@ -63,7 +63,7 @@ define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double>
ret <2 x double> %vecinsert2
}
-define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
+define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) {
; SSE2-LABEL: @buildvector_mul_addsub_ps256(
; SSE2-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
; SSE2-NEXT: [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]]
@@ -123,7 +123,7 @@ define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D,
ret <8 x float> %vecinsert8
}
-define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
+define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) {
; CHECK-LABEL: @buildvector_mul_addsub_pd256(
; CHECK-NEXT: [[A:%.*]] = fmul <4 x double> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fsub <4 x double> [[A]], [[B:%.*]]
@@ -151,7 +151,7 @@ define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double>
ret <4 x double> %vecinsert4
}
-define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) {
; SSE2-LABEL: @buildvector_mul_addsub_ps512(
; SSE2-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
; SSE2-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A]], [[B:%.*]]
@@ -243,7 +243,7 @@ define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float>
ret <16 x float> %vecinsert16
}
-define <16 x float> @buildvector_mul_addsub_ps512_partial(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+define <16 x float> @buildvector_mul_addsub_ps512_partial(<16 x float> %C, <16 x float> %D, <16 x float> %B) {
; SSE-LABEL: @buildvector_mul_addsub_ps512_partial(
; SSE-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
@@ -350,7 +350,7 @@ define <16 x float> @buildvector_mul_addsub_ps512_partial(<16 x float> %C, <16 x
ret <16 x float> %vecinsert16
}
-define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) {
; SSE2-LABEL: @buildvector_mul_addsub_pd512(
; SSE2-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
; SSE2-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
@@ -410,7 +410,7 @@ define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double>
ret <8 x double> %vecinsert8
}
-define <8 x double> @buildvector_mul_addsub_pd512_partial(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+define <8 x double> @buildvector_mul_addsub_pd512_partial(<8 x double> %C, <8 x double> %D, <8 x double> %B) {
; SSE-LABEL: @buildvector_mul_addsub_pd512_partial(
; SSE-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
@@ -507,7 +507,7 @@ define <8 x double> @buildvector_mul_addsub_pd512_partial(<8 x double> %C, <8 x
ret <8 x double> %vecinsert8
}
-define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
+define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) {
; CHECK-LABEL: @buildvector_mul_subadd_ps128(
; CHECK-NEXT: [[A:%.*]] = fmul <4 x float> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fadd <4 x float> [[A]], [[B:%.*]]
@@ -535,7 +535,7 @@ define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D,
ret <4 x float> %vecinsert4
}
-define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
+define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) {
; CHECK-LABEL: @buildvector_mul_subadd_pd128(
; CHECK-NEXT: [[A:%.*]] = fmul <2 x double> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[A]], [[B:%.*]]
@@ -555,7 +555,7 @@ define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double>
ret <2 x double> %vecinsert2
}
-define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
+define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) {
; SSE2-LABEL: @buildvector_mul_subadd_ps256(
; SSE2-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
; SSE2-NEXT: [[TMP0:%.*]] = fadd <8 x float> [[A]], [[B:%.*]]
@@ -634,7 +634,7 @@ define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D,
ret <8 x float> %vecinsert8
}
-define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
+define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) {
; CHECK-LABEL: @buildvector_mul_subadd_pd256(
; CHECK-NEXT: [[A:%.*]] = fmul <4 x double> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fadd <4 x double> [[A]], [[B:%.*]]
@@ -662,7 +662,7 @@ define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double>
ret <4 x double> %vecinsert4
}
-define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) {
; SSE-LABEL: @buildvector_mul_subadd_ps512(
; SSE-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A]], [[B:%.*]]
@@ -756,7 +756,7 @@ define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float>
ret <16 x float> %vecinsert16
}
-define <16 x float> @buildvector_mul_subadd_ps512_partial(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+define <16 x float> @buildvector_mul_subadd_ps512_partial(<16 x float> %C, <16 x float> %D, <16 x float> %B) {
; SSE-LABEL: @buildvector_mul_subadd_ps512_partial(
; SSE-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
@@ -863,7 +863,7 @@ define <16 x float> @buildvector_mul_subadd_ps512_partial(<16 x float> %C, <16 x
ret <16 x float> %vecinsert16
}
-define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) {
; SSE-LABEL: @buildvector_mul_subadd_pd512(
; SSE-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP0:%.*]] = fadd <8 x double> [[A]], [[B:%.*]]
@@ -925,7 +925,7 @@ define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double>
ret <8 x double> %vecinsert8
}
-define <8 x double> @buildvector_mul_subadd_pd512_partial(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+define <8 x double> @buildvector_mul_subadd_pd512_partial(<8 x double> %C, <8 x double> %D, <8 x double> %B) {
; SSE-LABEL: @buildvector_mul_subadd_pd512_partial(
; SSE-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = fadd <8 x double> [[A]], [[B:%.*]]
@@ -1021,5 +1021,3 @@ define <8 x double> @buildvector_mul_subadd_pd512_partial(<8 x double> %C, <8 x
%vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
ret <8 x double> %vecinsert8
}
-
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll
index ae6f4a7..57637d6 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-apple-macosx11.0.0"
; a[i] /= b;
; }
-define void @vdiv(ptr %a, float %b) #0 {
+define void @vdiv(ptr %a, float %b) {
; CHECK-LABEL: define void @vdiv(
; CHECK-SAME: ptr captures(none) [[A:%.*]], float [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
@@ -40,7 +40,7 @@ entry:
%i = alloca i32, align 4
store ptr %a, ptr %a.addr, align 8, !tbaa !3
store float %b, ptr %b.addr, align 4, !tbaa !7
- call void @llvm.lifetime.start.p0(ptr %i) #2
+ call void @llvm.lifetime.start.p0(ptr %i)
store i32 0, ptr %i, align 4, !tbaa !9
br label %for.cond
@@ -50,7 +50,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %i) #2
+ call void @llvm.lifetime.end.p0(ptr %i)
br label %for.end
for.body: ; preds = %for.cond
@@ -74,12 +74,8 @@ for.end: ; preds = %for.cond.cleanup
ret void
}
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind ssp uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="true" }
-attributes #1 = { argmemonly nofree nosync nounwind willreturn }
-attributes #2 = { nounwind }
+declare void @llvm.lifetime.start.p0(ptr nocapture)
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
index bcdf90c..bfb8554 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -211,7 +211,7 @@ for.end:
ret void
}
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
index dd5ff12..987a528 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
@@ -8,7 +8,7 @@
target triple = "x86_64--"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-define i32 @add_v4i32(ptr %p) #0 {
+define i32 @add_v4i32(ptr %p) {
; CHECK-LABEL: @add_v4i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]]
@@ -46,7 +46,7 @@ for.end:
ret i32 %r.0
}
-define signext i16 @mul_v8i16(ptr %p) #0 {
+define signext i16 @mul_v8i16(ptr %p) {
; CHECK-LABEL: @mul_v8i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2, !tbaa [[TBAA4:![0-9]+]]
@@ -89,7 +89,7 @@ for.end:
ret i16 %r.0
}
-define signext i8 @or_v16i8(ptr %p) #0 {
+define signext i8 @or_v16i8(ptr %p) {
; CHECK-LABEL: @or_v16i8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1, !tbaa [[TBAA6:![0-9]+]]
@@ -134,7 +134,7 @@ for.end:
ret i8 %r.0
}
-define i32 @smin_v4i32(ptr %p) #0 {
+define i32 @smin_v4i32(ptr %p) {
; CHECK-LABEL: @smin_v4i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4, !tbaa [[TBAA0]]
@@ -185,7 +185,7 @@ for.end:
ret i32 %r.0
}
-define i32 @umax_v4i32(ptr %p) #0 {
+define i32 @umax_v4i32(ptr %p) {
; CHECK-LABEL: @umax_v4i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4, !tbaa [[TBAA0]]
@@ -236,7 +236,7 @@ for.end:
ret i32 %r.0
}
-define float @fadd_v4i32(ptr %p) #0 {
+define float @fadd_v4i32(ptr %p) {
; CHECK-LABEL: @fadd_v4i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4, !tbaa [[TBAA7:![0-9]+]]
@@ -275,7 +275,7 @@ for.end:
ret float %r.0
}
-define float @fmul_v4i32(ptr %p) #0 {
+define float @fmul_v4i32(ptr %p) {
; CHECK-LABEL: @fmul_v4i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4, !tbaa [[TBAA7]]
@@ -315,7 +315,7 @@ for.end:
ret float %r.0
}
-define float @fmin_v4f32(ptr %p) #0 {
+define float @fmin_v4f32(ptr %p) {
; CHECK-LABEL: @fmin_v4f32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4, !tbaa [[TBAA7]]
@@ -440,8 +440,6 @@ entry:
ret float %call13
}
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+avx,+cx16,+cx8,+fxsr,+mmx,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "unsafe-fp-math"="true" "use-soft-float"="false" }
-
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git a9fe69c359de653015c39e413e48630d069abe27)"}
diff --git a/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll b/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
index 63279b0..92ea2c6 100644
--- a/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
+++ b/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
@@ -35,9 +35,8 @@ define void @ham() #1 {
; CHECK-NEXT: [[SNORK_EXIT:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 48 to ptr), align 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i64 0)
-; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP1]], <vscale x 16 x float> [[TMP2]], <vscale x 16 x float> undef
-; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[SPEC_SELECT]], i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> undef, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[TMP2]]
; CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[TMP3]])
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll
index a3cf23b..f793814 100644
--- a/llvm/test/Transforms/SCCP/conditions-ranges.ll
+++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll
@@ -1547,3 +1547,28 @@ bb2:
call void @use(i1 %c4)
ret void
}
+
+define i1 @and_predicate_dominating_phi(i32 %x) {
+; CHECK-LABEL: @and_predicate_dominating_phi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
+; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
+; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK: nope:
+; CHECK-NEXT: br label [[PHI]]
+; CHECK: phi:
+; CHECK-NEXT: ret i1 true
+;
+entry:
+ %xge1 = icmp uge i32 %x, 1
+ %xlt2 = icmp ult i32 %x, 2
+ %and = and i1 %xge1, %xlt2
+ br i1 %and, label %phi, label %nope
+nope:
+ br label %phi
+phi:
+ %res = phi i32 [ %x, %entry ], [ 1, %nope ]
+ %ret = icmp uge i32 %res, 1
+ ret i1 %ret
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll
index 3a56258..0bd9a99 100644
--- a/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll
@@ -5,9 +5,9 @@ target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx--nvidiacl"
; Vector versions of the intrinsics are scalarized, so keep them scalar
-define <2 x i8> @cltz_test(<2 x i8> %x) #0 {
+define <2 x i8> @cltz_test(<2 x i8> %x) {
; CHECK-LABEL: define <2 x i8> @cltz_test(
-; CHECK-SAME: <2 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: <2 x i8> [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X]], i32 0
; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
@@ -27,10 +27,9 @@ entry:
ret <2 x i8> %vecinit2
}
-
-define <2 x i8> @cltz_test_poison(<2 x i8> %x) #0 {
+define <2 x i8> @cltz_test_poison(<2 x i8> %x) {
; CHECK-LABEL: define <2 x i8> @cltz_test_poison(
-; CHECK-SAME: <2 x i8> [[X:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i8> [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X]], i32 0
; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
@@ -50,7 +49,5 @@ entry:
ret <2 x i8> %vecinit2
}
-declare i8 @llvm.ctlz.i8(i8, i1) #3
-
-attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i8 @llvm.ctlz.i8(i8, i1)
+ "unsafe-fp-math"="false" \ No newline at end of file
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
index 29589f3..def73d7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.8.0"
%class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113 = type { [4 x float] }
; Function Attrs: ssp uwtable
-define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(ptr %vertices, i1 %arg) #0 align 2 {
+define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(ptr %vertices, i1 %arg) align 2 {
; CHECK-LABEL: @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[ARG:%.*]], label [[RETURN:%.*]], label [[IF_END:%.*]]
@@ -128,5 +128,3 @@ if.then17.2: ; preds = %if.end22.1
if.end22.2: ; preds = %if.then17.2, %if.end22.1
br i1 %arg, label %for.end36, label %for.body
}
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
index fc1bd85..02e18d6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
@@ -5,7 +5,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-apple-macosx10.8.0"
; Function Attrs: nounwind ssp uwtable
-define void @main(i1 %arg) #0 {
+define void @main(i1 %arg) {
; CHECK-LABEL: @main(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 %arg, label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
@@ -73,5 +73,3 @@ for.body267: ; preds = %for.body267, %for.b
for.end300: ; preds = %for.body267, %for.end80
unreachable
}
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll b/llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll
index f98a569..686befe 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-apple-macosx10.7.0"
; A[8] = y0; A[8+1] = y1;
; }
-define i32 @depth(ptr nocapture %A, i32 %m) #0 !dbg !4 {
+define i32 @depth(ptr nocapture %A, i32 %m) !dbg !4 {
; CHECK-LABEL: @depth(
; CHECK-NEXT: entry:
; CHECK-NEXT: #dbg_value(ptr [[A:%.*]], [[META12:![0-9]+]], !DIExpression(), [[META18:![0-9]+]])
@@ -60,10 +60,7 @@ for.end: ; preds = %for.body.lr.ph, %en
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!18, !32}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll
index 1b76ee9..4d18bf8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll
@@ -6,7 +6,7 @@ target triple = "i386--netbsd"
@a = common global ptr null, align 4
; Function Attrs: noreturn nounwind readonly
-define i32 @fn1() #0 {
+define i32 @fn1() {
; CHECK-LABEL: define i32 @fn1(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
@@ -37,8 +37,6 @@ do.body: ; preds = %do.body, %entry
br label %do.body
}
-attributes #0 = { noreturn nounwind readonly "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!0 = !{!"any pointer", !1}
!1 = !{!"omnipotent char", !2}
!2 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
index 9e8cdc6..538751f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-unknown-linux-gnu"
; The GEP has scalar and vector parameters and returns vector of pointers.
; Function Attrs: noreturn readonly uwtable
-define void @_Z3fn1v(i32 %x, <16 x ptr>%y) local_unnamed_addr #0 {
+define void @_Z3fn1v(i32 %x, <16 x ptr>%y) local_unnamed_addr {
; CHECK-LABEL: @_Z3fn1v(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CONV42_LE:%.*]] = sext i32 [[X:%.*]] to i64
@@ -25,6 +25,3 @@ entry:
%VectorGep208 = getelementptr i32, <16 x ptr> %y, i64 %conv42.le
unreachable
}
-
-attributes #0 = { noreturn readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
index 79bef36..8077595 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-unknown-linux-gnu"
; the method implementation and it is not guaranteed that the best option
; is encountered first (like here).
-define double @root_selection(double %a, double %b, double %c, double %d) local_unnamed_addr #0 {
+define double @root_selection(double %a, double %b, double %c, double %d) local_unnamed_addr {
; CHECK-LABEL: @root_selection(
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A:%.*]], i32 1
@@ -53,5 +53,3 @@ define double @root_selection(double %a, double %b, double %c, double %d) local_
%i18 = fadd fast double %i17, %d
ret double %i18
}
-
-attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
index 369ca28..bdc09ed 100644
--- a/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
+++ b/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
@@ -8,7 +8,7 @@
@D = common global [2000 x float] zeroinitializer, align 16
; Function Attrs: nounwind ssp uwtable
-define void @foo_3double(i32 %u) #0 {
+define void @foo_3double(i32 %u) {
; CHECK-LABEL: @foo_3double(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
@@ -65,7 +65,7 @@ entry:
; A[C1 + C2*i] are consecutive, if C2 is a power of 2, and C2 > C1 > 0.
; Thus, the following code should be vectorized.
; Function Attrs: nounwind ssp uwtable
-define void @foo_2double(i32 %u) #0 {
+define void @foo_2double(i32 %u) {
; CHECK-LABEL: @foo_2double(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
@@ -104,7 +104,7 @@ entry:
; Similar to the previous test, but with different datatype.
; Function Attrs: nounwind ssp uwtable
-define void @foo_4float(i32 %u) #0 {
+define void @foo_4float(i32 %u) {
; CHECK-LABEL: @foo_4float(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
@@ -159,7 +159,7 @@ entry:
; Similar to the previous tests, but now we are dealing with AddRec SCEV.
; Function Attrs: nounwind ssp uwtable
-define i32 @foo_loop(ptr %A, i32 %n) #0 {
+define i32 @foo_loop(ptr %A, i32 %n) {
; CHECK-LABEL: @foo_loop(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
@@ -248,7 +248,7 @@ for.end: ; preds = %for.cond.for.end_cr
; Similar to foo_2double but with a non-power-of-2 factor and potential
; wrapping (both indices wrap or both don't at the same time)
; Function Attrs: nounwind ssp uwtable
-define void @foo_2double_non_power_of_2(i32 %u) #0 {
+define void @foo_2double_non_power_of_2(i32 %u) {
; CHECK-LABEL: @foo_2double_non_power_of_2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
@@ -289,7 +289,7 @@ entry:
; Similar to foo_2double_non_power_of_2 but with zext's instead of sext's
; Function Attrs: nounwind ssp uwtable
-define void @foo_2double_non_power_of_2_zext(i32 %u) #0 {
+define void @foo_2double_non_power_of_2_zext(i32 %u) {
; CHECK-LABEL: @foo_2double_non_power_of_2_zext(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
@@ -332,7 +332,7 @@ entry:
; Alternatively, this is like foo_loop, but with a non-power-of-2 factor and
; potential wrapping (both indices wrap or both don't at the same time)
; Function Attrs: nounwind ssp uwtable
-define i32 @foo_loop_non_power_of_2(ptr %A, i32 %n) #0 {
+define i32 @foo_loop_non_power_of_2(ptr %A, i32 %n) {
; CHECK-LABEL: @foo_loop_non_power_of_2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
@@ -438,7 +438,7 @@ for.end: ; preds = %for.cond.for.end_cr
;
; Make sure we are able to vectorize this from now on:
;
-define double @bar(ptr nocapture readonly %a, i32 %n) local_unnamed_addr #0 {
+define double @bar(ptr nocapture readonly %a, i32 %n) local_unnamed_addr {
; CHECK-X86-LABEL: @bar(
; CHECK-X86-NEXT: entry:
; CHECK-X86-NEXT: [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
@@ -548,8 +548,6 @@ define void @store_constant_expression(ptr %p) {
ret void
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 3.5.0 "}
diff --git a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
index 74e52da..385df87 100644
--- a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
@@ -6,7 +6,7 @@
; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -slp-threshold=0 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,NOTHRESHOLD %}
; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -slp-threshold=-10000 -slp-min-tree-size=0 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,MINTREESIZE %}
-define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
@@ -42,7 +42,7 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
declare void @llvm.assume(i1) nounwind
; This entire tree is ephemeral, don't vectorize any of it.
-define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; THRESHOLD-LABEL: @simple_select_eph(
; THRESHOLD-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; THRESHOLD-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -199,7 +199,7 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; Insert in an order different from the vector indices to make sure it
; doesn't matter
-define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_insert_out_of_order(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
@@ -233,15 +233,15 @@ define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float
ret <4 x float> %rd
}
-declare void @v4f32_user(<4 x float>) #0
-declare void @f32_user(float) #0
+declare void @v4f32_user(<4 x float>)
+declare void @f32_user(float)
; Multiple users of the final constructed vector
-define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_users(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
-; CHECK-NEXT: call void @v4f32_user(<4 x float> [[TMP2]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT: call void @v4f32_user(<4 x float> [[TMP2]])
; CHECK-NEXT: ret <4 x float> [[TMP2]]
;
%c0 = extractelement <4 x i32> %c, i32 0
@@ -268,12 +268,12 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32
%rb = insertelement <4 x float> %ra, float %s1, i32 1
%rc = insertelement <4 x float> %rb, float %s2, i32 2
%rd = insertelement <4 x float> %rc, float %s3, i32 3
- call void @v4f32_user(<4 x float> %rd) #0
+ call void @v4f32_user(<4 x float> %rd)
ret <4 x float> %rd
}
; Unused insertelement
-define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_no_users(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
@@ -319,7 +319,7 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
; Make sure infinite loop doesn't happen which I ran into when trying
; to do this backwards
-define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
+define <4 x i32> @reconstruct(<4 x i32> %c) {
; CHECK-LABEL: @reconstruct(
; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -342,7 +342,7 @@ define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
ret <4 x i32> %rd
}
-define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
+define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) {
; CHECK-LABEL: @simple_select_v2(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
@@ -366,7 +366,7 @@ define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %
; Make sure when we construct partial vectors, we don't keep
; re-visiting the insertelement chains starting with undef
; (low cost threshold needed to force this to happen)
-define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_partial_vector(
; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -487,7 +487,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
ret <4 x double> %i4
}
-define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
+define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr {
; CHECK-LABEL: @_vadd256(
; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: ret <8 x float> [[TMP1]]
@@ -526,5 +526,3 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr
%vecinit7.i = insertelement <8 x float> %vecinit6.i, float %add22, i32 7
ret <8 x float> %vecinit7.i
}
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll
index 0b896f4..37c02d6 100644
--- a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll
@@ -6,7 +6,7 @@
; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -slp-threshold=0 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,NOTHRESHOLD %}
; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -slp-threshold=-10000 -slp-min-tree-size=0 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,MINTREESIZE %}
-define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
@@ -39,7 +39,7 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
ret <4 x float> %rd
}
-define <8 x float> @simple_select2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <8 x float> @simple_select2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select2(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
@@ -77,7 +77,7 @@ define <8 x float> @simple_select2(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
declare void @llvm.assume(i1) nounwind
; This entire tree is ephemeral, don't vectorize any of it.
-define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; THRESHOLD-LABEL: @simple_select_eph(
; THRESHOLD-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; THRESHOLD-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -234,7 +234,7 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; Insert in an order different from the vector indices to make sure it
; doesn't matter
-define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_insert_out_of_order(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
@@ -268,15 +268,15 @@ define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float
ret <4 x float> %rd
}
-declare void @v4f32_user(<4 x float>) #0
-declare void @f32_user(float) #0
+declare void @v4f32_user(<4 x float>)
+declare void @f32_user(float)
; Multiple users of the final constructed vector
-define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_users(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
-; CHECK-NEXT: call void @v4f32_user(<4 x float> [[TMP2]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT: call void @v4f32_user(<4 x float> [[TMP2]])
; CHECK-NEXT: ret <4 x float> [[TMP2]]
;
%c0 = extractelement <4 x i32> %c, i32 0
@@ -303,12 +303,12 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32
%rb = insertelement <4 x float> %ra, float %s1, i32 1
%rc = insertelement <4 x float> %rb, float %s2, i32 2
%rd = insertelement <4 x float> %rc, float %s3, i32 3
- call void @v4f32_user(<4 x float> %rd) #0
+ call void @v4f32_user(<4 x float> %rd)
ret <4 x float> %rd
}
; Unused insertelement
-define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_no_users(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
@@ -355,7 +355,7 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
; Make sure infinite loop doesn't happen which I ran into when trying
; to do this backwards
-define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
+define <4 x i32> @reconstruct(<4 x i32> %c) {
; CHECK-LABEL: @reconstruct(
; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -378,7 +378,7 @@ define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
ret <4 x i32> %rd
}
-define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
+define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) {
; CHECK-LABEL: @simple_select_v2(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
@@ -402,7 +402,7 @@ define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %
; Make sure when we construct partial vectors, we don't keep
; re-visiting the insertelement chains starting with zeroinitializer
; (low cost threshold needed to force this to happen)
-define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_partial_vector(
; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -523,7 +523,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
ret <4 x double> %i4
}
-define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
+define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr {
; CHECK-LABEL: @_vadd256(
; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: ret <8 x float> [[TMP1]]
@@ -562,5 +562,3 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr
%vecinit7.i = insertelement <8 x float> %vecinit6.i, float %add22, i32 7
ret <8 x float> %vecinit7.i
}
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SROA/mem-par-metadata-sroa.ll b/llvm/test/Transforms/SROA/mem-par-metadata-sroa.ll
index bc05971..82ebdaa 100644
--- a/llvm/test/Transforms/SROA/mem-par-metadata-sroa.ll
+++ b/llvm/test/Transforms/SROA/mem-par-metadata-sroa.ll
@@ -33,7 +33,6 @@
; }
; }
-
; ModuleID = '<stdin>'
source_filename = "mem-par-metadata-sroa1.cpp"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -42,7 +41,7 @@ target triple = "x86_64-unknown-linux-gnu"
%class.Complex = type { float, float }
; Function Attrs: norecurse nounwind uwtable
-define void @_Z4testP7Complexl(ptr nocapture %out, i64 %size) local_unnamed_addr #0 {
+define void @_Z4testP7Complexl(ptr nocapture %out, i64 %size) local_unnamed_addr {
; CHECK-LABEL: @_Z4testP7Complexl(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_COND:%.*]]
@@ -108,10 +107,7 @@ for.end: ; preds = %for.cond
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #1
-
-attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1)
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll b/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
index 43fb260..d981626 100644
--- a/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
+++ b/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
@@ -1,7 +1,5 @@
; RUN: opt -safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefixes=TLS,ANDROID %s
-; RUN: opt -safe-stack -S -mtriple=aarch64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=TLS,FUCHSIA %s
; RUN: opt -passes=safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefixes=TLS,ANDROID %s
-; RUN: opt -passes=safe-stack -S -mtriple=aarch64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=TLS,FUCHSIA %s
define void @foo() nounwind uwtable safestack sspreq {
entry:
@@ -10,7 +8,6 @@ entry:
; TLS: %[[TP2:.*]] = call ptr @llvm.thread.pointer.p0()
; ANDROID: %[[B:.*]] = getelementptr i8, ptr %[[TP2]], i32 40
-; FUCHSIA: %[[B:.*]] = getelementptr i8, ptr %[[TP2]], i32 -16
; TLS: %[[StackGuard:.*]] = load ptr, ptr %[[B]]
; TLS: store ptr %[[StackGuard]], ptr %[[StackGuardSlot:.*]]
%a = alloca i128, align 16
diff --git a/llvm/test/Transforms/SafeStack/ARM/debug.ll b/llvm/test/Transforms/SafeStack/ARM/debug.ll
index 207475a..dfce13a 100644
--- a/llvm/test/Transforms/SafeStack/ARM/debug.ll
+++ b/llvm/test/Transforms/SafeStack/ARM/debug.ll
@@ -42,16 +42,15 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #2
; Function Attrs: nounwind readnone speculatable
declare void @llvm.dbg.declare(metadata, metadata, metadata) #3
-declare void @Capture(ptr) local_unnamed_addr #4
+declare void @Capture(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0(ptr nocapture) #2
-attributes #0 = { norecurse nounwind readonly safestack "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv7-a,+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind safestack "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv7-a,+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind readonly safestack }
+attributes #1 = { nounwind safestack }
attributes #2 = { argmemonly nounwind }
attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv7-a,+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { nounwind }
!llvm.dbg.cu = !{!2}
diff --git a/llvm/test/Transforms/SafeStack/X86/debug-loc.ll b/llvm/test/Transforms/SafeStack/X86/debug-loc.ll
index fcb4935..92197a7 100644
--- a/llvm/test/Transforms/SafeStack/X86/debug-loc.ll
+++ b/llvm/test/Transforms/SafeStack/X86/debug-loc.ll
@@ -44,11 +44,10 @@ entry:
; Function Attrs: nounwind readnone
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-declare void @Capture(ptr) #2
+declare void @Capture(ptr)
-attributes #0 = { safestack uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { safestack uwtable }
attributes #1 = { nounwind readnone }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!15, !16}
diff --git a/llvm/test/Transforms/SafeStack/X86/debug-loc2.ll b/llvm/test/Transforms/SafeStack/X86/debug-loc2.ll
index e60522f..634231d 100644
--- a/llvm/test/Transforms/SafeStack/X86/debug-loc2.ll
+++ b/llvm/test/Transforms/SafeStack/X86/debug-loc2.ll
@@ -45,7 +45,7 @@ entry:
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.start.p0(ptr nocapture) #1
-declare void @capture(ptr) #2
+declare void @capture(ptr)
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0(ptr nocapture) #1
@@ -55,9 +55,8 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) #3
declare void @llvm.random.metadata.use(metadata)
-attributes #0 = { noinline safestack uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline safestack uwtable }
attributes #1 = { argmemonly nounwind }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind }
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-symbol-list.ll b/llvm/test/Transforms/SampleProfile/Inputs/profile-symbol-list.ll
index b8e64e8..a48d46d 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/profile-symbol-list.ll
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-symbol-list.ll
@@ -90,10 +90,10 @@ while.end: ; preds = %while.body
; Function Attrs: nofree nounwind
declare dso_local i32 @printf(ptr nocapture readonly, ...) local_unnamed_addr #3
-attributes #0 = { noinline norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #2 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #3 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { noinline norecurse nounwind readnone uwtable "use-sample-profile" }
+attributes #1 = { norecurse nounwind readnone uwtable "use-sample-profile" }
+attributes #2 = { nofree norecurse nounwind uwtable "use-sample-profile" }
+attributes #3 = { nofree nounwind "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/SampleProfile/branch.ll b/llvm/test/Transforms/SampleProfile/branch.ll
index ff5d8bb..83f9354 100644
--- a/llvm/test/Transforms/SampleProfile/branch.ll
+++ b/llvm/test/Transforms/SampleProfile/branch.ll
@@ -62,7 +62,7 @@ if.end: ; preds = %entry
%1 = load ptr, ptr %argv.addr, align 8, !dbg !30
%arrayidx = getelementptr inbounds ptr, ptr %1, i64 1, !dbg !30
%2 = load ptr, ptr %arrayidx, align 8, !dbg !30
- %call = call i32 @atoi(ptr %2) #4, !dbg !31
+ %call = call i32 @atoi(ptr %2), !dbg !31
store i32 %call, ptr %limit, align 4, !dbg !29
%3 = load i32, ptr %limit, align 4, !dbg !32
%cmp1 = icmp sgt i32 %3, 100, !dbg !34
@@ -75,7 +75,7 @@ if.then.2: ; preds = %if.end
%4 = load ptr, ptr %argv.addr, align 8, !dbg !39
%arrayidx3 = getelementptr inbounds ptr, ptr %4, i64 2, !dbg !39
%5 = load ptr, ptr %arrayidx3, align 8, !dbg !39
- %call4 = call i32 @atoi(ptr %5) #4, !dbg !40
+ %call4 = call i32 @atoi(ptr %5), !dbg !40
%conv = sitofp i32 %call4 to double, !dbg !40
%mul = fmul double 0x40370ABE6A337A81, %conv, !dbg !41
store double %mul, ptr %s, align 8, !dbg !38
@@ -128,7 +128,7 @@ if.else: ; preds = %if.end
%16 = load ptr, ptr %argv.addr, align 8, !dbg !72
%arrayidx10 = getelementptr inbounds ptr, ptr %16, i64 2, !dbg !72
%17 = load ptr, ptr %arrayidx10, align 8, !dbg !72
- %call11 = call i32 @atoi(ptr %17) #4, !dbg !74
+ %call11 = call i32 @atoi(ptr %17), !dbg !74
%conv12 = sitofp i32 %call11 to double, !dbg !74
store double %conv12, ptr %result, align 8, !dbg !75
br label %if.end.13
@@ -145,18 +145,14 @@ return: ; preds = %if.end.13, %if.then
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readonly
-declare i32 @atoi(ptr) #2
+declare i32 @atoi(ptr)
-declare i32 @printf(ptr, ...) #3
+declare i32 @printf(ptr, ...)
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readonly "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readonly }
+attributes #0 = { uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14}
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
index 077eab7..ade2d73 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
@@ -56,7 +56,7 @@ for.body: ; preds = %for.body, %entry
; THRESHOLD-REPLAY: !{!"function_entry_count", i64 1, i64 446061515086924981, i64 3815895320998406042, i64 6309742469962978389, i64 7102633082150537521, i64 -2862076748587597320, i64 -2016976694713209516}
; THRESHOLD-REPLAY-NO-FUNCA: !{!"function_entry_count", i64 1, i64 446061515086924981, i64 3815895320998406042, i64 6309742469962978389, i64 7102633082150537521, i64 -2862076748587597320}
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll
index fe31023..de51afd 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll
@@ -88,8 +88,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
index 177329f..deabc27 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
@@ -109,8 +109,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-summary.ll b/llvm/test/Transforms/SampleProfile/csspgo-summary.ll
index f18425e..3daa69e 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-summary.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-summary.ll
@@ -75,8 +75,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll b/llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll
index 030b5aa..e4ee939 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll
@@ -87,8 +87,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/entry_counts_cold.ll b/llvm/test/Transforms/SampleProfile/entry_counts_cold.ll
index c7617c1..4aacde8 100644
--- a/llvm/test/Transforms/SampleProfile/entry_counts_cold.ll
+++ b/llvm/test/Transforms/SampleProfile/entry_counts_cold.ll
@@ -91,12 +91,11 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #2
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0(ptr nocapture) #2
-declare void @baz(...) #3
+declare void @baz(...)
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nounwind ssp uwtable "use-sample-profile" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { argmemonly nounwind }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/SampleProfile/entry_counts_missing_dbginfo.ll b/llvm/test/Transforms/SampleProfile/entry_counts_missing_dbginfo.ll
index 0e62921..e0cd883 100644
--- a/llvm/test/Transforms/SampleProfile/entry_counts_missing_dbginfo.ll
+++ b/llvm/test/Transforms/SampleProfile/entry_counts_missing_dbginfo.ll
@@ -101,12 +101,11 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #2
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0(ptr nocapture) #2
-declare void @baz(...) #3
+declare void @baz(...)
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nounwind ssp uwtable "use-sample-profile" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { argmemonly nounwind }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/SampleProfile/fsafdo_test.ll b/llvm/test/Transforms/SampleProfile/fsafdo_test.ll
index 4a35cb1..8cc5d06 100644
--- a/llvm/test/Transforms/SampleProfile/fsafdo_test.ll
+++ b/llvm/test/Transforms/SampleProfile/fsafdo_test.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
@sum = dso_local local_unnamed_addr global i32 0, align 4
declare i32 @bar(i32 %i) #0
-declare void @work(i32 %i) #2
+declare void @work(i32 %i)
define dso_local void @foo() #0 !dbg !29 {
; CHECK: Printing analysis {{.*}} for function 'foo':
@@ -141,7 +141,7 @@ if.end9.3:
; CHECK: edge %if.end9.3 -> %for.cond1.preheader probability is 0x7f7cb227 / 0x80000000 = 99.60% [HOT edge]
}
-define dso_local i32 @main() #3 !dbg !52 {
+define dso_local i32 @main() !dbg !52 {
entry:
br label %for.body, !dbg !53
@@ -157,10 +157,8 @@ for.end:
}
-attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile"}
+attributes #0 = { noinline nounwind uwtable "use-sample-profile"}
attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nofree noinline norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/SampleProfile/gcc-simple.ll b/llvm/test/Transforms/SampleProfile/gcc-simple.ll
index 315ac5a..2496d18 100644
--- a/llvm/test/Transforms/SampleProfile/gcc-simple.ll
+++ b/llvm/test/Transforms/SampleProfile/gcc-simple.ll
@@ -31,7 +31,7 @@ entry:
%i.addr = alloca i64, align 8
store i64 %i, ptr %i.addr, align 8
call void @llvm.dbg.declare(metadata ptr %i.addr, metadata !16, metadata !17), !dbg !18
- %call = call i32 @rand() #3, !dbg !19
+ %call = call i32 @rand(), !dbg !19
; CHECK: !prof ![[PROF1:[0-9]+]]
%cmp = icmp slt i32 %call, 500, !dbg !21
br i1 %cmp, label %if.then, label %if.else, !dbg !22
@@ -42,7 +42,7 @@ if.then: ; preds = %entry
br label %return, !dbg !23
if.else: ; preds = %entry
- %call1 = call i32 @rand() #3, !dbg !25
+ %call1 = call i32 @rand(), !dbg !25
; CHECK: !prof ![[PROF3:[0-9]+]]
%cmp2 = icmp sgt i32 %call1, 5000, !dbg !28
br i1 %cmp2, label %if.then.3, label %if.else.4, !dbg !29
@@ -62,10 +62,10 @@ return: ; preds = %if.else.4, %if.then
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind
-declare i32 @rand() #2
+declare i32 @rand()
; Function Attrs: nounwind uwtable
define i32 @main() #0 !dbg !9 {
@@ -141,10 +141,7 @@ for.end.6: ; preds = %for.cond
; CHECK: ![[PROF7]] = !{!"branch_weights", i32 18943, i32 1}
; CHECK: ![[PROF8]] = !{!"branch_weights", i32 18942}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+attributes #0 = { nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14}
diff --git a/llvm/test/Transforms/SampleProfile/inline-act.ll b/llvm/test/Transforms/SampleProfile/inline-act.ll
index 3ab3efd..e962642 100644
--- a/llvm/test/Transforms/SampleProfile/inline-act.ll
+++ b/llvm/test/Transforms/SampleProfile/inline-act.ll
@@ -24,7 +24,7 @@ target triple = "x86_64-unknown-linux-gnu"
@t = global i32 0, align 4
; Function Attrs: nounwind uwtable
-define zeroext i1 @_Z3fooi(i32) #0 {
+define zeroext i1 @_Z3fooi(i32) {
%switch.tableidx = sub i32 %0, 0
%2 = icmp ult i32 %switch.tableidx, 4
br i1 %2, label %switch.lookup, label %3
@@ -41,7 +41,7 @@ switch.lookup: ; preds = %1
}
; Function Attrs: nounwind uwtable
-define void @_Z3bari(i32) #0 !dbg !9 {
+define void @_Z3bari(i32) !dbg !9 {
%2 = call zeroext i1 @_Z3fooi(i32 %0), !dbg !10
br i1 %2, label %3, label %6, !dbg !10
@@ -55,8 +55,6 @@ define void @_Z3bari(i32) #0 !dbg !9 {
ret void
}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3}
!llvm.ident = !{!4}
diff --git a/llvm/test/Transforms/SampleProfile/misexpect.ll b/llvm/test/Transforms/SampleProfile/misexpect.ll
index 26f76ee5..a7667fc 100644
--- a/llvm/test/Transforms/SampleProfile/misexpect.ll
+++ b/llvm/test/Transforms/SampleProfile/misexpect.ll
@@ -68,7 +68,7 @@ if.end: ; preds = %entry
%1 = load ptr, ptr %argv.addr, align 8, !dbg !30
%arrayidx = getelementptr inbounds ptr, ptr %1, i64 1, !dbg !30
%2 = load ptr, ptr %arrayidx, align 8, !dbg !30
- %call = call i32 @atoi(ptr %2) #4, !dbg !31
+ %call = call i32 @atoi(ptr %2), !dbg !31
store i32 %call, ptr %limit, align 4, !dbg !29
%3 = load i32, ptr %limit, align 4, !dbg !32
%exp = call i32 @llvm.expect.i32(i32 %3, i32 0)
@@ -80,7 +80,7 @@ if.then.2: ; preds = %if.end
%4 = load ptr, ptr %argv.addr, align 8, !dbg !39
%arrayidx3 = getelementptr inbounds ptr, ptr %4, i64 2, !dbg !39
%5 = load ptr, ptr %arrayidx3, align 8, !dbg !39
- %call4 = call i32 @atoi(ptr %5) #4, !dbg !40
+ %call4 = call i32 @atoi(ptr %5), !dbg !40
%conv = sitofp i32 %call4 to double, !dbg !40
%mul = fmul double 0x40370ABE6A337A81, %conv, !dbg !41
store double %mul, ptr %s, align 8, !dbg !38
@@ -130,7 +130,7 @@ if.else: ; preds = %if.end
%16 = load ptr, ptr %argv.addr, align 8, !dbg !72
%arrayidx10 = getelementptr inbounds ptr, ptr %16, i64 2, !dbg !72
%17 = load ptr, ptr %arrayidx10, align 8, !dbg !72
- %call11 = call i32 @atoi(ptr %17) #4, !dbg !74
+ %call11 = call i32 @atoi(ptr %17), !dbg !74
%conv12 = sitofp i32 %call11 to double, !dbg !74
store double %conv12, ptr %result, align 8, !dbg !75
br label %if.end.13
@@ -147,23 +147,17 @@ return: ; preds = %if.end.13, %if.then
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readonly
-declare i32 @atoi(ptr) #2
+declare i32 @atoi(ptr)
-declare i32 @printf(ptr, ...) #3
+declare i32 @printf(ptr, ...)
; Function Attrs: nounwind readnone willreturn
-declare i32 @llvm.expect.i32(i32, i32) #5
+declare i32 @llvm.expect.i32(i32, i32)
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readonly "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readonly }
-attributes #5 = { nounwind readnone willreturn }
+attributes #0 = { uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14}
diff --git a/llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll
index 30a5198..5ce8670 100644
--- a/llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll
+++ b/llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll
@@ -17,7 +17,7 @@ entry:
}
; Function Attrs: nofree nounwind
-declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...) #1
+declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...)
; Function Attrs: uwtable mustprogress
define dso_local void @_Z3hoov() #0 !dbg !11 {
@@ -35,7 +35,7 @@ if.end: ; preds = %if.then, %entry
ret void, !dbg !22
}
-declare !dbg !23 dso_local void @_Z10hoo_calleev() #2
+declare !dbg !23 dso_local void @_Z10hoo_calleev()
; MAX2-LABEL: @_Z3goov(
; MAX2: icmp eq ptr {{.*}} @_Z3hoov
@@ -78,12 +78,9 @@ entry:
; MAX4: ![[PROF_ID5]] = !{!"VP", i32 0, i64 13000, i64 4128940972712279918, i64 -1, i64 3137940972712279918, i64 -1, i64 2132940972712279918, i64 -1, i64 1850239051784516332, i64 -1}
; Function Attrs: nofree nounwind
-declare noundef i32 @puts(ptr nocapture noundef readonly) #3
+declare noundef i32 @puts(ptr nocapture noundef readonly)
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #1 = { nofree nounwind "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nofree nounwind }
+attributes #0 = { uwtable mustprogress "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll
index 9b9bbca4..c5645cd 100644
--- a/llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll
+++ b/llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll
@@ -34,11 +34,10 @@ entry:
}
; Function Attrs: nofree nounwind
-declare noundef i32 @puts(ptr nocapture noundef readonly) #2
+declare noundef i32 @puts(ptr nocapture noundef readonly) #1
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #1 = { nofree nounwind "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nofree nounwind }
+attributes #0 = { uwtable mustprogress "use-sample-profile" }
+attributes #1 = { nofree nounwind }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/SampleProfile/norepeated-icp-4.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp-4.ll
index 57a2386..c418372 100644
--- a/llvm/test/Transforms/SampleProfile/norepeated-icp-4.ll
+++ b/llvm/test/Transforms/SampleProfile/norepeated-icp-4.ll
@@ -33,7 +33,7 @@ entry:
ret void, !dbg !21
}
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
+attributes #0 = { uwtable mustprogress "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !25}
diff --git a/llvm/test/Transforms/SampleProfile/norepeated-icp.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp.ll
index f3340ba..bf3cfe4 100644
--- a/llvm/test/Transforms/SampleProfile/norepeated-icp.ll
+++ b/llvm/test/Transforms/SampleProfile/norepeated-icp.ll
@@ -15,7 +15,7 @@ entry:
}
; Function Attrs: nofree nounwind
-declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...) #1
+declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...)
; Function Attrs: uwtable mustprogress
define dso_local void @_Z3goov() #0 !dbg !11 {
@@ -40,11 +40,9 @@ entry:
}
; Function Attrs: nofree nounwind
-declare noundef i32 @puts(ptr nocapture noundef readonly) #2
+declare noundef i32 @puts(ptr nocapture noundef readonly)
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #1 = { nofree nounwind "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nofree nounwind }
+attributes #0 = { uwtable mustprogress "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/SampleProfile/offset.ll b/llvm/test/Transforms/SampleProfile/offset.ll
index 0fdeb08..0f0187a 100644
--- a/llvm/test/Transforms/SampleProfile/offset.ll
+++ b/llvm/test/Transforms/SampleProfile/offset.ll
@@ -47,7 +47,7 @@ return: ; preds = %if.else, %if.then
; Function Attrs: nounwind readnone
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nounwind uwtable "use-sample-profile" }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-order.ll b/llvm/test/Transforms/SampleProfile/profile-context-order.ll
index db368bc..eddbb0b 100644
--- a/llvm/test/Transforms/SampleProfile/profile-context-order.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-order.ll
@@ -118,8 +118,8 @@ entry:
declare i32 @_Z3foo(i32)
-attributes #0 = { nofree noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
index bb0abb1..aad4e38 100644
--- a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
@@ -155,8 +155,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
index 9d2ac2f..a1bf359 100644
--- a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
@@ -152,8 +152,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll
index f85ab24..3cb8cd1 100644
--- a/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll
@@ -99,8 +99,8 @@ entry:
declare i32 @_Z3foo(i32)
-attributes #0 = { nofree noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/propagate.ll b/llvm/test/Transforms/SampleProfile/propagate.ll
index ece85a8..3a07152 100644
--- a/llvm/test/Transforms/SampleProfile/propagate.ll
+++ b/llvm/test/Transforms/SampleProfile/propagate.ll
@@ -123,7 +123,6 @@ for.cond8: ; preds = %for.inc, %if.else7
; CHECK: edge %for.cond8 -> %for.body10 probability is 0x7e941a89 / 0x80000000 = 98.89% [HOT edge]
; CHECK: edge %for.cond8 -> %for.end probability is 0x016be577 / 0x80000000 = 1.11%
-
for.body10: ; preds = %for.cond8
%14 = load i64, ptr %j, align 8, !dbg !69
%15 = load i32, ptr %x.addr, align 4, !dbg !71
@@ -171,7 +170,7 @@ return: ; preds = %if.end20, %if.then
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: norecurse uwtable
define i32 @main() #2 !dbg !86 {
@@ -198,12 +197,10 @@ entry:
ret i32 0, !dbg !104
}
-declare i32 @printf(ptr, ...) #3
+declare i32 @printf(ptr, ...)
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { norecurse uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "use-sample-profile" }
+attributes #2 = { norecurse uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
index 26ae198..7d6fd1b 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
@@ -2,11 +2,10 @@
; RUN: opt < %s -passes='default<O2>' -debug-info-for-profiling -pseudo-probe-for-profiling -S | FileCheck %s --check-prefix=PROBE
; RUN: opt < %s -passes='thinlto-pre-link<O2>' -debug-info-for-profiling -pseudo-probe-for-profiling -S | FileCheck %s --check-prefix=PROBE
-
@a = dso_local global i32 0, align 4
; Function Attrs: uwtable
-define void @_Z3foov(i32 %x) #0 !dbg !4 {
+define void @_Z3foov(i32 %x) !dbg !4 {
bb0:
%cmp = icmp eq i32 %x, 0, !dbg !10
br i1 %cmp, label %bb1, label %bb2
@@ -30,13 +29,10 @@ bb3:
ret void, !dbg !12
}
-declare void @_Z3barv() #1
+declare void @_Z3barv()
declare void @llvm.lifetime.start.p0(ptr nocapture) nounwind argmemonly
declare void @llvm.lifetime.end.p0(ptr nocapture) nounwind argmemonly
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
@@ -62,7 +58,6 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "fra
; DEBUG: ![[INST]] = !DILocation(line: 4, column: 15, scope: ![[INSTBLOCK:[0-9]+]])
; DEBUG: ![[INSTBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 4)
-
; PROBE: ![[CALL1]] = !DILocation(line: 4, column: 3, scope: ![[CALL1BLOCK:[0-9]+]])
; PROBE: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 455147551)
; PROBE: ![[CALL2]] = !DILocation(line: 4, column: 9, scope: ![[CALL2BLOCK:[0-9]+]])
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll
index 383289e..92f04d5 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll
@@ -135,7 +135,7 @@ declare dso_local i32 @printf(ptr, ...)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #3
-attributes #0 = { nounwind uwtable "disable-tail-calls"="true" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "use-sample-profile" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #3 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-no-debug-info.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-no-debug-info.ll
index e45ddb1..08b7e4c 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-no-debug-info.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-no-debug-info.ll
@@ -1,6 +1,5 @@
; RUN: opt < %s -passes='pseudo-probe,cgscc(inline)' -S | FileCheck %s
-
; CHECK-LABEL: @caller(
; This instruction did not have !dbg metadata in the callee but gets a !dbg after inlining.
@@ -10,26 +9,23 @@
; CHECK-NOT: call void @llvm.pseudoprobe({{.*}}), !dbg ![[#]]
; CHECK: call void @llvm.pseudoprobe({{.*}})
-
@a = common global i32 0, align 4
@b = common global i32 0, align 4
; Function Attrs: nounwind uwtable
-define void @callee() #0 {
+define void @callee() {
entry:
store i32 1, ptr @a, align 4
ret void
}
; Function Attrs: nounwind uwtable
-define void @caller() #0 !dbg !4 {
+define void @caller() !dbg !4 {
entry:
tail call void @callee(), !dbg !12
ret void, !dbg !12
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}
!llvm.ident = !{!10}
diff --git a/llvm/test/Transforms/SampleProfile/remarks.ll b/llvm/test/Transforms/SampleProfile/remarks.ll
index 3cb91b7..decf4b1 100644
--- a/llvm/test/Transforms/SampleProfile/remarks.ll
+++ b/llvm/test/Transforms/SampleProfile/remarks.ll
@@ -202,10 +202,10 @@ entry:
ret i32 %conv, !dbg !58
}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nounwind uwtable "use-sample-profile" }
attributes #1 = { nounwind argmemonly }
attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind alwaysinline "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind alwaysinline }
attributes #4 = { nounwind }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/SampleProfile/uniqname.ll b/llvm/test/Transforms/SampleProfile/uniqname.ll
index 23c3ac9..67988c7 100644
--- a/llvm/test/Transforms/SampleProfile/uniqname.ll
+++ b/llvm/test/Transforms/SampleProfile/uniqname.ll
@@ -65,9 +65,9 @@ if.end: ; preds = %if.then, %entry
ret void, !dbg !31
}
-declare !dbg !32 dso_local void @_Z10hoo_calleev() #3
+declare !dbg !32 dso_local void @_Z10hoo_calleev()
-declare !dbg !33 dso_local void @_Z10moo_calleev() #3
+declare !dbg !33 dso_local void @_Z10moo_calleev()
; Function Attrs: uwtable mustprogress
define internal void @_ZL3noov.__uniq.334154460836426447066042049082945760258() #1 !dbg !34 {
@@ -84,12 +84,11 @@ if.end: ; preds = %if.then, %entry
ret void, !dbg !37
}
-declare !dbg !38 dso_local void @_Z10noo_calleev() #3
+declare !dbg !38 dso_local void @_Z10noo_calleev()
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #1 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "sample-profile-suffix-elision-policy"="selected" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #2 = { noinline uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "sample-profile-suffix-elision-policy"="selected" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #3 = { "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable mustprogress "use-sample-profile" }
+attributes #1 = { uwtable mustprogress "use-sample-profile" "sample-profile-suffix-elision-policy"="selected" }
+attributes #2 = { noinline uwtable mustprogress "use-sample-profile" "sample-profile-suffix-elision-policy"="selected" }
; CHECK: ![[PROF_ID1]] = !{!"branch_weights", i32 5931}
; CHECK: ![[PROF_ID2]] = !{!"branch_weights", i32 2000}
diff --git a/llvm/test/Transforms/Scalarizer/dbginfo.ll b/llvm/test/Transforms/Scalarizer/dbginfo.ll
index 464e752..d6b8aa2 100644
--- a/llvm/test/Transforms/Scalarizer/dbginfo.ll
+++ b/llvm/test/Transforms/Scalarizer/dbginfo.ll
@@ -2,7 +2,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Function Attrs: nounwind uwtable
-define void @f1(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c) #0 !dbg !4 {
+define void @f1(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c) !dbg !4 {
; CHECK: @f1(
; CHECK: %a.i1 = getelementptr i32, ptr %a, i32 1
; CHECK: %a.i2 = getelementptr i32, ptr %a, i32 2
@@ -45,10 +45,7 @@ entry:
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!18, !26}
diff --git a/llvm/test/Transforms/SimplifyCFG/Hexagon/switch-to-lookup-table.ll b/llvm/test/Transforms/SimplifyCFG/Hexagon/switch-to-lookup-table.ll
index ccc8def..7aa6ced 100644
--- a/llvm/test/Transforms/SimplifyCFG/Hexagon/switch-to-lookup-table.ll
+++ b/llvm/test/Transforms/SimplifyCFG/Hexagon/switch-to-lookup-table.ll
@@ -7,7 +7,7 @@ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i
target triple = "hexagon-unknown--elf"
; Function Attrs: noinline nounwind
-define i32 @foo(i32 %x) #0 section ".tcm_text" {
+define i32 @foo(i32 %x) section ".tcm_text" {
; ENABLE-LABEL: @foo(
; ENABLE-NEXT: entry:
; ENABLE-NEXT: [[TMP0:%.*]] = icmp ult i32 [[X:%.*]], 6
@@ -92,5 +92,3 @@ return: ; preds = %sw.default, %sw.bb5
%1 = load i32, ptr %retval, align 4
ret i32 %1
}
-
-attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/merge-cleanuppads.ll b/llvm/test/Transforms/SimplifyCFG/X86/merge-cleanuppads.ll
index 6e6c97f..2d21a59 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/merge-cleanuppads.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/merge-cleanuppads.ll
@@ -3,37 +3,33 @@ target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc18.0.0"
; Function Attrs: uwtable
-define void @test1() #0 personality ptr @__CxxFrameHandler3 {
+define void @test1() personality ptr @__CxxFrameHandler3 {
entry:
invoke void @may_throw(i32 3)
to label %invoke.cont unwind label %ehcleanup
invoke.cont: ; preds = %entry
- tail call void @may_throw(i32 2) #2
- tail call void @may_throw(i32 1) #2
+ tail call void @may_throw(i32 2)
+ tail call void @may_throw(i32 1)
ret void
ehcleanup: ; preds = %entry
%cp = cleanuppad within none []
- tail call void @may_throw(i32 2) #2 [ "funclet"(token %cp) ]
+ tail call void @may_throw(i32 2) [ "funclet"(token %cp) ]
cleanupret from %cp unwind label %ehcleanup2
ehcleanup2:
%cp2 = cleanuppad within none []
- tail call void @may_throw(i32 1) #2 [ "funclet"(token %cp2) ]
+ tail call void @may_throw(i32 1) [ "funclet"(token %cp2) ]
cleanupret from %cp2 unwind to caller
}
; CHECK-LABEL: define void @test1(
; CHECK: %[[cp:.*]] = cleanuppad within none []
-; CHECK: tail call void @may_throw(i32 2) #2 [ "funclet"(token %[[cp]]) ]
-; CHECK: tail call void @may_throw(i32 1) #2 [ "funclet"(token %[[cp]]) ]
+; CHECK: tail call void @may_throw(i32 2) [ "funclet"(token %[[cp]]) ]
+; CHECK: tail call void @may_throw(i32 1) [ "funclet"(token %[[cp]]) ]
; CHECK: cleanupret from %[[cp]] unwind to caller
-declare void @may_throw(i32) #1
+declare void @may_throw(i32)
declare i32 @__CxxFrameHandler3(...)
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
diff --git a/llvm/test/Transforms/SimplifyCFG/pr50060-constantfold-loopid.ll b/llvm/test/Transforms/SimplifyCFG/pr50060-constantfold-loopid.ll
index 19e1c73..0363792 100644
--- a/llvm/test/Transforms/SimplifyCFG/pr50060-constantfold-loopid.ll
+++ b/llvm/test/Transforms/SimplifyCFG/pr50060-constantfold-loopid.ll
@@ -13,7 +13,7 @@
@C = dso_local global i32 0, align 4
; Function Attrs: nounwind
-define dso_local void @_Z6test01v() addrspace(1) #0 {
+define dso_local void @_Z6test01v() addrspace(1) {
; CHECK-LABEL: @_Z6test01v(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[J:%.*]] = alloca i32, align 4
@@ -22,7 +22,7 @@ define dso_local void @_Z6test01v() addrspace(1) #0 {
; CHECK: do.body:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @C, align 4, !tbaa [[TBAA2:![0-9]+]]
; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
-; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.start.p0(ptr [[J]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.start.p0(ptr [[J]])
; CHECK-NEXT: store i32 0, ptr [[J]], align 4, !tbaa [[TBAA2]]
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
@@ -30,11 +30,11 @@ define dso_local void @_Z6test01v() addrspace(1) #0 {
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.end.p0(ptr [[J]]) #[[ATTR2]]
+; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.end.p0(ptr [[J]])
; CHECK-NEXT: br label [[DO_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: store i32 undef, ptr [[I]], align 4
-; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.start.p0(ptr [[I]]) #[[ATTR2]]
+; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.start.p0(ptr [[I]])
; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !tbaa [[TBAA2]]
; CHECK-NEXT: br label [[FOR_COND1:%.*]]
; CHECK: for.cond1:
@@ -43,7 +43,7 @@ define dso_local void @_Z6test01v() addrspace(1) #0 {
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], [[TMP3]]
; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY4:%.*]], label [[FOR_COND_CLEANUP3:%.*]]
; CHECK: for.cond.cleanup3:
-; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.end.p0(ptr [[I]]) #[[ATTR2]]
+; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.end.p0(ptr [[I]])
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[J]], align 4, !tbaa [[TBAA2]]
; CHECK-NEXT: [[INC7:%.*]] = add nsw i32 [[TMP4]], 1
; CHECK-NEXT: store i32 [[INC7]], ptr [[J]], align 4, !tbaa [[TBAA2]]
@@ -64,7 +64,7 @@ entry:
do.body: ; preds = %do.cond, %entry
%0 = load i32, ptr @C, align 4, !tbaa !2
%inc = add nsw i32 %0, 1
- call addrspace(1) void @llvm.lifetime.start.p0(ptr %j) #2
+ call addrspace(1) void @llvm.lifetime.start.p0(ptr %j)
store i32 0, ptr %j, align 4, !tbaa !2
br label %for.cond
@@ -74,12 +74,12 @@ for.cond: ; preds = %for.inc6, %do.body
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
- call addrspace(1) void @llvm.lifetime.end.p0(ptr %j) #2
+ call addrspace(1) void @llvm.lifetime.end.p0(ptr %j)
br label %for.end8
for.body: ; preds = %for.cond
store i32 undef, ptr %i, align 4
- call addrspace(1) void @llvm.lifetime.start.p0(ptr %i) #2
+ call addrspace(1) void @llvm.lifetime.start.p0(ptr %i)
store i32 0, ptr %i, align 4, !tbaa !2
br label %for.cond1
@@ -90,7 +90,7 @@ for.cond1: ; preds = %for.inc, %for.body
br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
for.cond.cleanup3: ; preds = %for.cond1
- call addrspace(1) void @llvm.lifetime.end.p0(ptr %i) #2
+ call addrspace(1) void @llvm.lifetime.end.p0(ptr %i)
br label %for.end
for.body4: ; preds = %for.cond1
@@ -124,14 +124,10 @@ do.end: ; preds = %do.cond
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) addrspace(1) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture) addrspace(1)
; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) addrspace(1) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nofree nosync nounwind willreturn }
-attributes #2 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture) addrspace(1)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
index b3cbc3d..0d3846d 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
@@ -7,7 +7,7 @@ target triple = "amdgcn--"
%struct.Matrix4x4 = type { [4 x [4 x float]] }
; Function Attrs: nounwind
-define fastcc void @Accelerator_Intersect(ptr addrspace(1) nocapture readonly %leafTransformations) #0 {
+define fastcc void @Accelerator_Intersect(ptr addrspace(1) nocapture readonly %leafTransformations) {
; CHECK-LABEL: @Accelerator_Intersect(
entry:
%tmp = sext i32 undef to i64
@@ -17,5 +17,3 @@ entry:
%tmp2 = load <4 x float>, ptr addrspace(1) undef, align 4
ret void
}
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "target-cpu"="tahiti" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll b/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll
index 91e9511..11753cf 100644
--- a/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll
+++ b/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll
@@ -63,7 +63,3 @@ ENDIF28: ; preds = %ENDIF
%tmp36 = icmp sgt i32 %tmp20, 2
br i1 %tmp36, label %ENDLOOP, label %LOOP.outer
}
-
-attributes #0 = { "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
index 2e96a92..cc1dc4e 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
@@ -994,3 +994,30 @@ define void @test_assume_deep_and_tree(i1 %a1) {
call void @foo(i1 %a15)
ret void
}
+
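+; Check that the copies of %x that PredicateInfo materializes for the branch
+; condition (printed as no-op bitcasts in the CHECK lines below) replace %x as
+; the phi operand arriving from the true edge.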
+define i32 @test_and_with_phinode(i32 %x) {
+; CHECK-LABEL: @test_and_with_phinode(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
+; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
+; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32
+; CHECK: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32
+; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK: nope:
+; CHECK-NEXT: br label [[PHI]]
+; CHECK: phi:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %xge1 = icmp uge i32 %x, 1
+ %xlt2 = icmp ult i32 %x, 2
+ %and = and i1 %xge1, %xlt2
+ br i1 %and, label %phi, label %nope
+nope:
+ br label %phi
+phi:
+ %res = phi i32 [ %x, %entry ], [ 1, %nope ]
+ ret i32 %res
+}
diff --git a/llvm/test/Transforms/Util/dbg-user-of-aext.ll b/llvm/test/Transforms/Util/dbg-user-of-aext.ll
index 9e7935e..b3d1b90 100644
--- a/llvm/test/Transforms/Util/dbg-user-of-aext.ll
+++ b/llvm/test/Transforms/Util/dbg-user-of-aext.ll
@@ -27,7 +27,7 @@
%struct.foo = type { i8, i64 }
; Function Attrs: noinline nounwind uwtable
-define void @_Z1fbb3foo(i1 zeroext %b, i1 zeroext %frag, i8 %g.coerce0, i64 %g.coerce1) #0 !dbg !6 {
+define void @_Z1fbb3foo(i1 zeroext %b, i1 zeroext %frag, i8 %g.coerce0, i64 %g.coerce1) !dbg !6 {
entry:
%g = alloca %struct.foo, align 8
%b.addr = alloca i8, align 1
@@ -51,10 +51,7 @@ entry:
; CHECK: ![[VAR_FRAG]] = !DILocalVariable(name: "frag"
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll b/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll
index ad23bf7..e9f0c8c 100644
--- a/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll
+++ b/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll
@@ -19,18 +19,18 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
; Function Attrs: nounwind
-define float @fn(float %f) #0 {
+define float @fn(float %f) {
; CHECK: define float @fn(
; CHECK: call fast float @expf(
%f.addr = alloca float, align 4
store float %f, ptr %f.addr, align 4, !tbaa !1
%1 = load float, ptr %f.addr, align 4, !tbaa !1
- %call = call fast float @expf(float %1) #3
+ %call = call fast float @expf(float %1)
ret float %call
}
; Function Attrs: inlinehint nounwind readnone
-define available_externally float @expf(float %x) #1 {
+define available_externally float @expf(float %x) {
; CHECK: define available_externally float @expf(
; CHECK: fpext float
; CHECK: call fast double @exp(
@@ -39,17 +39,13 @@ define available_externally float @expf(float %x) #1 {
store float %x, ptr %x.addr, align 4, !tbaa !1
%1 = load float, ptr %x.addr, align 4, !tbaa !1
%conv = fpext float %1 to double
- %call = call fast double @exp(double %conv) #3
+ %call = call fast double @exp(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
}
; Function Attrs: nounwind readnone
-declare double @exp(double) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { inlinehint nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+declare double @exp(double)
!llvm.ident = !{!0}
diff --git a/llvm/test/Verifier/atomics.ll b/llvm/test/Verifier/atomics.ll
index f835b98..17bf5a0 100644
--- a/llvm/test/Verifier/atomics.ll
+++ b/llvm/test/Verifier/atomics.ll
@@ -1,14 +1,15 @@
; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s
-; CHECK: atomic store operand must have integer, pointer, or floating point type!
-; CHECK: atomic load operand must have integer, pointer, or floating point type!
+; CHECK: atomic store operand must have integer, pointer, floating point, or vector type!
+; CHECK: atomic load operand must have integer, pointer, floating point, or vector type!
+%ty = type { i32 };
-define void @foo(ptr %P, <1 x i64> %v) {
- store atomic <1 x i64> %v, ptr %P unordered, align 8
+define void @foo(ptr %P, %ty %v) {
+ store atomic %ty %v, ptr %P unordered, align 8
ret void
}
-define <1 x i64> @bar(ptr %P) {
- %v = load atomic <1 x i64>, ptr %P unordered, align 8
- ret <1 x i64> %v
+define %ty @bar(ptr %P) {
+ %v = load atomic %ty, ptr %P unordered, align 8
+ ret %ty %v
}
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll b/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll
index b2362f8..ade228d 100644
--- a/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll
+++ b/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll
@@ -49,7 +49,7 @@ entry:
; CHECK-FUNC-LEVEL-ABC: Function: abc
; CHECK-FUNC-LEVEL-NEXT-ABC: [ 3630.00 3672.00 3714.00 ]
-; CHECK-FUNC-DEF: Error: Function 'def' not found
+; CHECK-FUNC-DEF: error: Function 'def' not found
; CHECK-BB-LEVEL: Function: abc
; CHECK-BB-LEVEL-NEXT: entry: [ 3630.00 3672.00 3714.00 ]
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll
index f9aa108..9d60e12 100644
--- a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll
+++ b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll
@@ -49,7 +49,7 @@ entry:
; CHECK-FUNC-LEVEL-ABC: Function: abc
; CHECK-FUNC-LEVEL-NEXT-ABC: [ 878.00 889.00 900.00 ]
-; CHECK-FUNC-DEF: Error: Function 'def' not found
+; CHECK-FUNC-DEF: error: Function 'def' not found
; CHECK-BB-LEVEL: Function: abc
; CHECK-BB-LEVEL-NEXT: entry: [ 878.00 889.00 900.00 ]
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir
new file mode 100644
index 0000000..ef835fe
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir
@@ -0,0 +1,92 @@
+# REQUIRES: x86_64-linux
+# RUN: llvm-ir2vec embeddings --mode=mir --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-DEFAULT
+# RUN: llvm-ir2vec embeddings --mode=mir --level=func --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+# RUN: llvm-ir2vec embeddings --mode=mir --level=func --function=add_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ADD
+# RUN: not llvm-ir2vec embeddings --mode=mir --level=func --function=missing_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-MISSING
+# RUN: llvm-ir2vec embeddings --mode=mir --level=bb --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL
+# RUN: llvm-ir2vec embeddings --mode=mir --level=inst --function=add_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-INST-LEVEL
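+#
+# The RUN lines above cover each output granularity: the default and
+# --level=func print one vector per machine function, --level=bb one per
+# machine basic block, and --level=inst one per instruction; --function
+# restricts the output to a single machine function.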
+
+--- |
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-linux-gnu"
+
+ define dso_local noundef i32 @add_function(i32 noundef %a, i32 noundef %b) {
+ entry:
+ %sum = add nsw i32 %a, %b
+ %result = mul nsw i32 %sum, 2
+ ret i32 %result
+ }
+
+ define dso_local void @simple_function() {
+ entry:
+ ret void
+ }
+...
+---
+name: add_function
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr32 }
+ - { id: 2, class: gr32 }
+ - { id: 3, class: gr32 }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+ - { reg: '$esi', virtual-reg: '%1' }
+body: |
+ bb.0.entry:
+ liveins: $edi, $esi
+
+ %1:gr32 = COPY $esi
+ %0:gr32 = COPY $edi
+ %2:gr32 = nsw ADD32rr %0, %1, implicit-def dead $eflags
+ %3:gr32 = ADD32rr %2, %2, implicit-def dead $eflags
+ $eax = COPY %3
+ RET 0, $eax
+
+---
+name: simple_function
+alignment: 16
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ RET 0
+
+# CHECK-DEFAULT: MIR2Vec embeddings for machine function add_function:
+# CHECK-DEFAULT-NEXT: Function vector: [ 26.50 27.10 27.70 ]
+# CHECK-DEFAULT: MIR2Vec embeddings for machine function simple_function:
+# CHECK-DEFAULT-NEXT: Function vector: [ 1.10 1.20 1.30 ]
+
+# CHECK-FUNC-LEVEL: MIR2Vec embeddings for machine function add_function:
+# CHECK-FUNC-LEVEL-NEXT: Function vector: [ 26.50 27.10 27.70 ]
+# CHECK-FUNC-LEVEL: MIR2Vec embeddings for machine function simple_function:
+# CHECK-FUNC-LEVEL-NEXT: Function vector: [ 1.10 1.20 1.30 ]
+
+# CHECK-FUNC-LEVEL-ADD: MIR2Vec embeddings for machine function add_function:
+# CHECK-FUNC-LEVEL-ADD-NEXT: Function vector: [ 26.50 27.10 27.70 ]
+# CHECK-FUNC-LEVEL-ADD-NOT: simple_function
+
+# CHECK-FUNC-MISSING: error: Function 'missing_function' not found
+
+# CHECK-BB-LEVEL: MIR2Vec embeddings for machine function add_function:
+# CHECK-BB-LEVEL-NEXT: Basic block vectors:
+# CHECK-BB-LEVEL-NEXT: MBB entry: [ 26.50 27.10 27.70 ]
+# CHECK-BB-LEVEL: MIR2Vec embeddings for machine function simple_function:
+# CHECK-BB-LEVEL-NEXT: Basic block vectors:
+# CHECK-BB-LEVEL-NEXT: MBB entry: [ 1.10 1.20 1.30 ]
+
+# CHECK-INST-LEVEL: MIR2Vec embeddings for machine function add_function:
+# CHECK-INST-LEVEL-NEXT: Instruction vectors:
+# CHECK-INST-LEVEL: %1:gr32 = COPY $esi
+# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ]
+# CHECK-INST-LEVEL-NEXT: %0:gr32 = COPY $edi
+# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ]
+# CHECK-INST-LEVEL: %2:gr32 = nsw ADD32rr
+# CHECK-INST-LEVEL: -> [ 3.70 3.80 3.90 ]
+# CHECK-INST-LEVEL: %3:gr32 = ADD32rr
+# CHECK-INST-LEVEL: -> [ 3.70 3.80 3.90 ]
+# CHECK-INST-LEVEL: $eax = COPY %3:gr32
+# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ]
+# CHECK-INST-LEVEL: RET 0, $eax
+# CHECK-INST-LEVEL-NEXT: -> [ 1.10 1.20 1.30 ]
diff --git a/llvm/test/tools/llvm-ir2vec/entities.mir b/llvm/test/tools/llvm-ir2vec/entities.mir
new file mode 100644
index 0000000..60d9c7a
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/entities.mir
@@ -0,0 +1,28 @@
+# REQUIRES: x86_64-linux
+# RUN: llvm-ir2vec entities --mode=mir %s -o %t1.log 2>&1
+# RUN: diff %S/output/reference_x86_entities.txt %t1.log
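+#
+# The entities subcommand dumps the vocabulary entity table (a count followed
+# by "opcode-name ID" pairs); the dump is diffed verbatim against the
+# checked-in x86 reference file.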
+
+--- |
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-linux-gnu"
+
+ define dso_local noundef i32 @test_function(i32 noundef %a) {
+ entry:
+ ret i32 %a
+ }
+...
+---
+name: test_function
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+body: |
+ bb.0.entry:
+ liveins: $edi
+
+ %0:gr32 = COPY $edi
+ $eax = COPY %0
+ RET 0, $eax
diff --git a/llvm/test/tools/llvm-ir2vec/error-handling.ll b/llvm/test/tools/llvm-ir2vec/error-handling.ll
index b944ea0..8e9e455 100644
--- a/llvm/test/tools/llvm-ir2vec/error-handling.ll
+++ b/llvm/test/tools/llvm-ir2vec/error-handling.ll
@@ -10,4 +10,4 @@ entry:
}
; CHECK-NO-VOCAB: error: IR2Vec vocabulary file path not specified; You may need to set it using --ir2vec-vocab-path
-; CHECK-FUNC-NOT-FOUND: Error: Function 'nonexistent' not found
+; CHECK-FUNC-NOT-FOUND: error: Function 'nonexistent' not found
diff --git a/llvm/test/tools/llvm-ir2vec/error-handling.mir b/llvm/test/tools/llvm-ir2vec/error-handling.mir
new file mode 100644
index 0000000..caec454c
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/error-handling.mir
@@ -0,0 +1,41 @@
+# REQUIRES: x86_64-linux
+# Test error handling and input validation for the llvm-ir2vec tool in MIR mode
+
+# RUN: not llvm-ir2vec embeddings --mode=mir %s 2>&1 | FileCheck %s -check-prefix=CHECK-NO-VOCAB
+# RUN: not llvm-ir2vec embeddings --mode=mir --mir2vec-vocab-path=%S/nonexistent-vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-VOCAB-NOT-FOUND
+# RUN: not llvm-ir2vec embeddings --mode=mir --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_invalid_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-INVALID-VOCAB
+# RUN: not llvm-ir2vec embeddings --mode=mir --function=nonexistent_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-NOT-FOUND
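+#
+# Each RUN line above triggers a distinct failure: no vocabulary path given,
+# a vocabulary file that does not exist, a vocabulary file missing its
+# 'Opcodes' section, and a --function argument naming a function absent from
+# the module.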
+
+--- |
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-linux-gnu"
+
+ define dso_local noundef i32 @test_function(i32 noundef %a) {
+ entry:
+ ret i32 %a
+ }
+...
+---
+name: test_function
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+body: |
+ bb.0.entry:
+ liveins: $edi
+
+ %0:gr32 = COPY $edi
+ $eax = COPY %0
+ RET 0, $eax
+
+# CHECK-NO-VOCAB: error: Failed to load MIR2Vec vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path
+
+# CHECK-VOCAB-NOT-FOUND: error: Failed to load MIR2Vec vocabulary
+# CHECK-VOCAB-NOT-FOUND: No such file or directory
+
+# CHECK-INVALID-VOCAB: error: Failed to load MIR2Vec vocabulary - Missing 'Opcodes' section in vocabulary file
+
+# CHECK-FUNC-NOT-FOUND: error: Function 'nonexistent_function' not found
diff --git a/llvm/test/tools/llvm-ir2vec/output/lit.local.cfg b/llvm/test/tools/llvm-ir2vec/output/lit.local.cfg
new file mode 100644
index 0000000..2406f19
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/output/lit.local.cfg
@@ -0,0 +1,3 @@
+# Don't treat files in this directory as tests
+# These are reference data files, not test scripts
+config.suffixes = []
diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt
new file mode 100644
index 0000000..dfbac4c
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt
@@ -0,0 +1,33 @@
+MAX_RELATION=4
+187 7072 1
+187 6968 2
+187 187 0
+187 7072 1
+187 6969 2
+187 10 0
+10 7072 1
+10 7072 2
+10 7072 3
+10 6961 4
+10 187 0
+187 6952 1
+187 7072 2
+187 1555 0
+1555 6882 1
+1555 6952 2
+187 7072 1
+187 6968 2
+187 187 0
+187 7072 1
+187 6969 2
+187 601 0
+601 7072 1
+601 7072 2
+601 7072 3
+601 6961 4
+601 187 0
+187 6952 1
+187 7072 2
+187 1555 0
+1555 6882 1
+1555 6952 2
diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt
new file mode 100644
index 0000000..dc436d1
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt
@@ -0,0 +1,7174 @@
+7173
+AAA 0
+AAD 1
+AADD 2
+AAM 3
+AAND 4
+AAS 5
+ABS_F 6
+ABS_Fp 7
+ADC 8
+ADCX 9
+ADD 10
+ADDPDrm 11
+ADDPDrr 12
+ADDPSrm 13
+ADDPSrr 14
+ADDR 15
+ADDSDrm 16
+ADDSDrm_Int 17
+ADDSDrr 18
+ADDSDrr_Int 19
+ADDSSrm 20
+ADDSSrm_Int 21
+ADDSSrr 22
+ADDSSrr_Int 23
+ADDSUBPDrm 24
+ADDSUBPDrr 25
+ADDSUBPSrm 26
+ADDSUBPSrr 27
+ADD_F 28
+ADD_FI 29
+ADD_FPrST 30
+ADD_FST 31
+ADD_Fp 32
+ADD_FpI 33
+ADD_FrST 34
+ADJCALLSTACKDOWN 35
+ADJCALLSTACKUP 36
+ADOX 37
+AESDEC 38
+AESDECLASTrm 39
+AESDECLASTrr 40
+AESDECWIDE 41
+AESDECrm 42
+AESDECrr 43
+AESENC 44
+AESENCLASTrm 45
+AESENCLASTrr 46
+AESENCWIDE 47
+AESENCrm 48
+AESENCrr 49
+AESIMCrm 50
+AESIMCrr 51
+AESKEYGENASSISTrmi 52
+AESKEYGENASSISTrri 53
+AND 54
+ANDN 55
+ANDNPDrm 56
+ANDNPDrr 57
+ANDNPSrm 58
+ANDNPSrr 59
+ANDPDrm 60
+ANDPDrr 61
+ANDPSrm 62
+ANDPSrr 63
+ANNOTATION_LABEL 64
+AOR 65
+ARITH_FENCE 66
+ARPL 67
+ASAN_CHECK_MEMACCESS 68
+AVX 69
+AVX_SET 70
+AXOR 71
+BEXTR 72
+BEXTRI 73
+BLCFILL 74
+BLCI 75
+BLCIC 76
+BLCMSK 77
+BLCS 78
+BLENDPDrmi 79
+BLENDPDrri 80
+BLENDPSrmi 81
+BLENDPSrri 82
+BLENDVPDrm 83
+BLENDVPDrr 84
+BLENDVPSrm 85
+BLENDVPSrr 86
+BLSFILL 87
+BLSI 88
+BLSIC 89
+BLSMSK 90
+BLSR 91
+BOUNDS 92
+BSF 93
+BSR 94
+BSWAP 95
+BT 96
+BTC 97
+BTR 98
+BTS 99
+BUNDLE 100
+BZHI 101
+CALL 102
+CALLpcrel 103
+CATCHRET 104
+CBW 105
+CCMP 106
+CDQ 107
+CDQE 108
+CFCMOV 109
+CFI_INSTRUCTION 110
+CHS_F 111
+CHS_Fp 112
+CLAC 113
+CLC 114
+CLD 115
+CLDEMOTE 116
+CLEANUPRET 117
+CLFLUSH 118
+CLFLUSHOPT 119
+CLGI 120
+CLI 121
+CLRSSBSY 122
+CLTS 123
+CLUI 124
+CLWB 125
+CLZERO 126
+CMC 127
+CMOV 128
+CMOVBE_F 129
+CMOVBE_Fp 130
+CMOVB_F 131
+CMOVB_Fp 132
+CMOVE_F 133
+CMOVE_Fp 134
+CMOVNBE_F 135
+CMOVNBE_Fp 136
+CMOVNB_F 137
+CMOVNB_Fp 138
+CMOVNE_F 139
+CMOVNE_Fp 140
+CMOVNP_F 141
+CMOVNP_Fp 142
+CMOVP_F 143
+CMOVP_Fp 144
+CMOV_FR 145
+CMOV_GR 146
+CMOV_RFP 147
+CMOV_VK 148
+CMOV_VR 149
+CMP 150
+CMPCCXADDmr 151
+CMPPDrmi 152
+CMPPDrri 153
+CMPPSrmi 154
+CMPPSrri 155
+CMPSB 156
+CMPSDrmi 157
+CMPSDrmi_Int 158
+CMPSDrri 159
+CMPSDrri_Int 160
+CMPSL 161
+CMPSQ 162
+CMPSSrmi 163
+CMPSSrmi_Int 164
+CMPSSrri 165
+CMPSSrri_Int 166
+CMPSW 167
+CMPXCHG 168
+COMISDrm 169
+COMISDrm_Int 170
+COMISDrr 171
+COMISDrr_Int 172
+COMISSrm 173
+COMISSrm_Int 174
+COMISSrr 175
+COMISSrr_Int 176
+COMP_FST 177
+COM_FIPr 178
+COM_FIr 179
+COM_FST 180
+COM_FpIr 181
+COM_Fpr 182
+CONVERGENCECTRL_ANCHOR 183
+CONVERGENCECTRL_ENTRY 184
+CONVERGENCECTRL_GLUE 185
+CONVERGENCECTRL_LOOP 186
+COPY 187
+COPY_TO_REGCLASS 188
+CPUID 189
+CQO 190
+CRC 191
+CS_PREFIX 192
+CTEST 193
+CVTDQ 194
+CVTPD 195
+CVTPS 196
+CVTSD 197
+CVTSI 198
+CVTSS 199
+CVTTPD 200
+CVTTPS 201
+CVTTSD 202
+CVTTSS 203
+CWD 204
+CWDE 205
+DAA 206
+DAS 207
+DATA 208
+DBG_INSTR_REF 209
+DBG_LABEL 210
+DBG_PHI 211
+DBG_VALUE 212
+DBG_VALUE_LIST 213
+DEC 214
+DIV 215
+DIVPDrm 216
+DIVPDrr 217
+DIVPSrm 218
+DIVPSrr 219
+DIVR_F 220
+DIVR_FI 221
+DIVR_FPrST 222
+DIVR_FST 223
+DIVR_Fp 224
+DIVR_FpI 225
+DIVR_FrST 226
+DIVSDrm 227
+DIVSDrm_Int 228
+DIVSDrr 229
+DIVSDrr_Int 230
+DIVSSrm 231
+DIVSSrm_Int 232
+DIVSSrr 233
+DIVSSrr_Int 234
+DIV_F 235
+DIV_FI 236
+DIV_FPrST 237
+DIV_FST 238
+DIV_Fp 239
+DIV_FpI 240
+DIV_FrST 241
+DPPDrmi 242
+DPPDrri 243
+DPPSrmi 244
+DPPSrri 245
+DS_PREFIX 246
+DYN_ALLOCA 247
+EH_LABEL 248
+EH_RETURN 249
+EH_SjLj_LongJmp 250
+EH_SjLj_SetJmp 251
+EH_SjLj_Setup 252
+ENCLS 253
+ENCLU 254
+ENCLV 255
+ENCODEKEY 256
+ENDBR 257
+ENQCMD 258
+ENQCMDS 259
+ENTER 260
+ERETS 261
+ERETU 262
+ES_PREFIX 263
+EXTRACTPSmri 264
+EXTRACTPSrri 265
+EXTRACT_SUBREG 266
+EXTRQ 267
+EXTRQI 268
+F 269
+FAKE_USE 270
+FARCALL 271
+FARJMP 272
+FAULTING_OP 273
+FBLDm 274
+FBSTPm 275
+FCOM 276
+FCOMP 277
+FCOMPP 278
+FCOS 279
+FDECSTP 280
+FEMMS 281
+FENTRY_CALL 282
+FFREE 283
+FFREEP 284
+FICOM 285
+FICOMP 286
+FINCSTP 287
+FLDCW 288
+FLDENVm 289
+FLDL 290
+FLDLG 291
+FLDLN 292
+FLDPI 293
+FNCLEX 294
+FNINIT 295
+FNOP 296
+FNSTCW 297
+FNSTSW 298
+FNSTSWm 299
+FP 300
+FPATAN 301
+FPREM 302
+FPTAN 303
+FRNDINT 304
+FRSTORm 305
+FSAVEm 306
+FSCALE 307
+FSIN 308
+FSINCOS 309
+FSTENVm 310
+FS_PREFIX 311
+FXRSTOR 312
+FXSAVE 313
+FXTRACT 314
+FYL 315
+FsFLD 316
+GC_LABEL 317
+GETSEC 318
+GF 319
+GS_PREFIX 320
+G_ABDS 321
+G_ABDU 322
+G_ABS 323
+G_ADD 324
+G_ADDRSPACE_CAST 325
+G_AND 326
+G_ANYEXT 327
+G_ASHR 328
+G_ASSERT_ALIGN 329
+G_ASSERT_SEXT 330
+G_ASSERT_ZEXT 331
+G_ATOMICRMW_ADD 332
+G_ATOMICRMW_AND 333
+G_ATOMICRMW_FADD 334
+G_ATOMICRMW_FMAX 335
+G_ATOMICRMW_FMAXIMUM 336
+G_ATOMICRMW_FMIN 337
+G_ATOMICRMW_FMINIMUM 338
+G_ATOMICRMW_FSUB 339
+G_ATOMICRMW_MAX 340
+G_ATOMICRMW_MIN 341
+G_ATOMICRMW_NAND 342
+G_ATOMICRMW_OR 343
+G_ATOMICRMW_SUB 344
+G_ATOMICRMW_UDEC_WRAP 345
+G_ATOMICRMW_UINC_WRAP 346
+G_ATOMICRMW_UMAX 347
+G_ATOMICRMW_UMIN 348
+G_ATOMICRMW_USUB_COND 349
+G_ATOMICRMW_USUB_SAT 350
+G_ATOMICRMW_XCHG 351
+G_ATOMICRMW_XOR 352
+G_ATOMIC_CMPXCHG 353
+G_ATOMIC_CMPXCHG_WITH_SUCCESS 354
+G_BITCAST 355
+G_BITREVERSE 356
+G_BLOCK_ADDR 357
+G_BR 358
+G_BRCOND 359
+G_BRINDIRECT 360
+G_BRJT 361
+G_BSWAP 362
+G_BUILD_VECTOR 363
+G_BUILD_VECTOR_TRUNC 364
+G_BZERO 365
+G_CONCAT_VECTORS 366
+G_CONSTANT 367
+G_CONSTANT_FOLD_BARRIER 368
+G_CONSTANT_POOL 369
+G_CTLZ 370
+G_CTLZ_ZERO_UNDEF 371
+G_CTPOP 372
+G_CTTZ 373
+G_CTTZ_ZERO_UNDEF 374
+G_DEBUGTRAP 375
+G_DYN_STACKALLOC 376
+G_EXTRACT 377
+G_EXTRACT_SUBVECTOR 378
+G_EXTRACT_VECTOR_ELT 379
+G_FABS 380
+G_FACOS 381
+G_FADD 382
+G_FASIN 383
+G_FATAN 384
+G_FCANONICALIZE 385
+G_FCEIL 386
+G_FCMP 387
+G_FCONSTANT 388
+G_FCOPYSIGN 389
+G_FCOS 390
+G_FCOSH 391
+G_FDIV 392
+G_FENCE 393
+G_FEXP 394
+G_FFLOOR 395
+G_FFREXP 396
+G_FILD 397
+G_FIST 398
+G_FLDCW 399
+G_FLDEXP 400
+G_FLOG 401
+G_FMA 402
+G_FMAD 403
+G_FMAXIMUM 404
+G_FMAXIMUMNUM 405
+G_FMAXNUM 406
+G_FMAXNUM_IEEE 407
+G_FMINIMUM 408
+G_FMINIMUMNUM 409
+G_FMINNUM 410
+G_FMINNUM_IEEE 411
+G_FMODF 412
+G_FMUL 413
+G_FNEARBYINT 414
+G_FNEG 415
+G_FNSTCW 416
+G_FPEXT 417
+G_FPOW 418
+G_FPOWI 419
+G_FPTOSI 420
+G_FPTOSI_SAT 421
+G_FPTOUI 422
+G_FPTOUI_SAT 423
+G_FPTRUNC 424
+G_FRAME_INDEX 425
+G_FREEZE 426
+G_FREM 427
+G_FRINT 428
+G_FSHL 429
+G_FSHR 430
+G_FSIN 431
+G_FSINCOS 432
+G_FSINH 433
+G_FSQRT 434
+G_FSUB 435
+G_FTAN 436
+G_FTANH 437
+G_GET_FPENV 438
+G_GET_FPMODE 439
+G_GET_ROUNDING 440
+G_GLOBAL_VALUE 441
+G_ICMP 442
+G_IMPLICIT_DEF 443
+G_INDEXED_LOAD 444
+G_INDEXED_SEXTLOAD 445
+G_INDEXED_STORE 446
+G_INDEXED_ZEXTLOAD 447
+G_INSERT 448
+G_INSERT_SUBVECTOR 449
+G_INSERT_VECTOR_ELT 450
+G_INTRINSIC 451
+G_INTRINSIC_CONVERGENT 452
+G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS 453
+G_INTRINSIC_FPTRUNC_ROUND 454
+G_INTRINSIC_LLRINT 455
+G_INTRINSIC_LRINT 456
+G_INTRINSIC_ROUND 457
+G_INTRINSIC_ROUNDEVEN 458
+G_INTRINSIC_TRUNC 459
+G_INTRINSIC_W_SIDE_EFFECTS 460
+G_INTTOPTR 461
+G_INVOKE_REGION_START 462
+G_IS_FPCLASS 463
+G_JUMP_TABLE 464
+G_LLROUND 465
+G_LOAD 466
+G_LROUND 467
+G_LSHR 468
+G_MEMCPY 469
+G_MEMCPY_INLINE 470
+G_MEMMOVE 471
+G_MEMSET 472
+G_MERGE_VALUES 473
+G_MUL 474
+G_OR 475
+G_PHI 476
+G_PREFETCH 477
+G_PTRAUTH_GLOBAL_VALUE 478
+G_PTRMASK 479
+G_PTRTOINT 480
+G_PTR_ADD 481
+G_READCYCLECOUNTER 482
+G_READSTEADYCOUNTER 483
+G_READ_REGISTER 484
+G_RESET_FPENV 485
+G_RESET_FPMODE 486
+G_ROTL 487
+G_ROTR 488
+G_SADDE 489
+G_SADDO 490
+G_SADDSAT 491
+G_SBFX 492
+G_SCMP 493
+G_SDIV 494
+G_SDIVFIX 495
+G_SDIVFIXSAT 496
+G_SDIVREM 497
+G_SELECT 498
+G_SET_FPENV 499
+G_SET_FPMODE 500
+G_SET_ROUNDING 501
+G_SEXT 502
+G_SEXTLOAD 503
+G_SEXT_INREG 504
+G_SHL 505
+G_SHUFFLE_VECTOR 506
+G_SITOFP 507
+G_SMAX 508
+G_SMIN 509
+G_SMULFIX 510
+G_SMULFIXSAT 511
+G_SMULH 512
+G_SMULO 513
+G_SPLAT_VECTOR 514
+G_SREM 515
+G_SSHLSAT 516
+G_SSUBE 517
+G_SSUBO 518
+G_SSUBSAT 519
+G_STACKRESTORE 520
+G_STACKSAVE 521
+G_STEP_VECTOR 522
+G_STORE 523
+G_STRICT_FADD 524
+G_STRICT_FDIV 525
+G_STRICT_FLDEXP 526
+G_STRICT_FMA 527
+G_STRICT_FMUL 528
+G_STRICT_FREM 529
+G_STRICT_FSQRT 530
+G_STRICT_FSUB 531
+G_SUB 532
+G_TRAP 533
+G_TRUNC 534
+G_TRUNC_SSAT_S 535
+G_TRUNC_SSAT_U 536
+G_TRUNC_USAT_U 537
+G_UADDE 538
+G_UADDO 539
+G_UADDSAT 540
+G_UBFX 541
+G_UBSANTRAP 542
+G_UCMP 543
+G_UDIV 544
+G_UDIVFIX 545
+G_UDIVFIXSAT 546
+G_UDIVREM 547
+G_UITOFP 548
+G_UMAX 549
+G_UMIN 550
+G_UMULFIX 551
+G_UMULFIXSAT 552
+G_UMULH 553
+G_UMULO 554
+G_UNMERGE_VALUES 555
+G_UREM 556
+G_USHLSAT 557
+G_USUBE 558
+G_USUBO 559
+G_USUBSAT 560
+G_VAARG 561
+G_VASTART 562
+G_VECREDUCE_ADD 563
+G_VECREDUCE_AND 564
+G_VECREDUCE_FADD 565
+G_VECREDUCE_FMAX 566
+G_VECREDUCE_FMAXIMUM 567
+G_VECREDUCE_FMIN 568
+G_VECREDUCE_FMINIMUM 569
+G_VECREDUCE_FMUL 570
+G_VECREDUCE_MUL 571
+G_VECREDUCE_OR 572
+G_VECREDUCE_SEQ_FADD 573
+G_VECREDUCE_SEQ_FMUL 574
+G_VECREDUCE_SMAX 575
+G_VECREDUCE_SMIN 576
+G_VECREDUCE_UMAX 577
+G_VECREDUCE_UMIN 578
+G_VECREDUCE_XOR 579
+G_VECTOR_COMPRESS 580
+G_VSCALE 581
+G_WRITE_REGISTER 582
+G_XOR 583
+G_ZEXT 584
+G_ZEXTLOAD 585
+HADDPDrm 586
+HADDPDrr 587
+HADDPSrm 588
+HADDPSrr 589
+HLT 590
+HRESET 591
+HSUBPDrm 592
+HSUBPDrr 593
+HSUBPSrm 594
+HSUBPSrr 595
+ICALL_BRANCH_FUNNEL 596
+IDIV 597
+ILD_F 598
+ILD_Fp 599
+IMPLICIT_DEF 600
+IMUL 601
+IMULZU 602
+IN 603
+INC 604
+INCSSPD 605
+INCSSPQ 606
+INDIRECT_THUNK_CALL 607
+INDIRECT_THUNK_TCRETURN 608
+INIT_UNDEF 609
+INLINEASM 610
+INLINEASM_BR 611
+INSB 612
+INSERTPSrmi 613
+INSERTPSrri 614
+INSERTQ 615
+INSERTQI 616
+INSERT_SUBREG 617
+INSL 618
+INSW 619
+INT 620
+INTO 621
+INVD 622
+INVEPT 623
+INVLPG 624
+INVLPGA 625
+INVLPGB 626
+INVPCID 627
+INVVPID 628
+IRET 629
+ISTT_FP 630
+ISTT_Fp 631
+IST_F 632
+IST_FP 633
+IST_Fp 634
+Int_eh_sjlj_setup_dispatch 635
+JCC 636
+JCXZ 637
+JECXZ 638
+JMP 639
+JMPABS 640
+JRCXZ 641
+JUMP_TABLE_DEBUG_INFO 642
+KADDBkk 643
+KADDDkk 644
+KADDQkk 645
+KADDWkk 646
+KANDBkk 647
+KANDDkk 648
+KANDNBkk 649
+KANDNDkk 650
+KANDNQkk 651
+KANDNWkk 652
+KANDQkk 653
+KANDWkk 654
+KCFI_CHECK 655
+KILL 656
+KMOVBkk 657
+KMOVBkk_EVEX 658
+KMOVBkm 659
+KMOVBkm_EVEX 660
+KMOVBkr 661
+KMOVBkr_EVEX 662
+KMOVBmk 663
+KMOVBmk_EVEX 664
+KMOVBrk 665
+KMOVBrk_EVEX 666
+KMOVDkk 667
+KMOVDkk_EVEX 668
+KMOVDkm 669
+KMOVDkm_EVEX 670
+KMOVDkr 671
+KMOVDkr_EVEX 672
+KMOVDmk 673
+KMOVDmk_EVEX 674
+KMOVDrk 675
+KMOVDrk_EVEX 676
+KMOVQkk 677
+KMOVQkk_EVEX 678
+KMOVQkm 679
+KMOVQkm_EVEX 680
+KMOVQkr 681
+KMOVQkr_EVEX 682
+KMOVQmk 683
+KMOVQmk_EVEX 684
+KMOVQrk 685
+KMOVQrk_EVEX 686
+KMOVWkk 687
+KMOVWkk_EVEX 688
+KMOVWkm 689
+KMOVWkm_EVEX 690
+KMOVWkr 691
+KMOVWkr_EVEX 692
+KMOVWmk 693
+KMOVWmk_EVEX 694
+KMOVWrk 695
+KMOVWrk_EVEX 696
+KNOTBkk 697
+KNOTDkk 698
+KNOTQkk 699
+KNOTWkk 700
+KORBkk 701
+KORDkk 702
+KORQkk 703
+KORTESTBkk 704
+KORTESTDkk 705
+KORTESTQkk 706
+KORTESTWkk 707
+KORWkk 708
+KSET 709
+KSHIFTLBki 710
+KSHIFTLDki 711
+KSHIFTLQki 712
+KSHIFTLWki 713
+KSHIFTRBki 714
+KSHIFTRDki 715
+KSHIFTRQki 716
+KSHIFTRWki 717
+KTESTBkk 718
+KTESTDkk 719
+KTESTQkk 720
+KTESTWkk 721
+KUNPCKBWkk 722
+KUNPCKDQkk 723
+KUNPCKWDkk 724
+KXNORBkk 725
+KXNORDkk 726
+KXNORQkk 727
+KXNORWkk 728
+KXORBkk 729
+KXORDkk 730
+KXORQkk 731
+KXORWkk 732
+LAHF 733
+LAR 734
+LCMPXCHG 735
+LDDQUrm 736
+LDMXCSR 737
+LDS 738
+LDTILECFG 739
+LDTILECFG_EVEX 740
+LD_F 741
+LD_Fp 742
+LD_Frr 743
+LEA 744
+LEAVE 745
+LES 746
+LFENCE 747
+LFS 748
+LGDT 749
+LGS 750
+LIDT 751
+LIFETIME_END 752
+LIFETIME_START 753
+LKGS 754
+LLDT 755
+LLWPCB 756
+LMSW 757
+LOADIWKEY 758
+LOAD_STACK_GUARD 759
+LOCAL_ESCAPE 760
+LOCK_ADD 761
+LOCK_AND 762
+LOCK_BTC 763
+LOCK_BTC_RM 764
+LOCK_BTR 765
+LOCK_BTR_RM 766
+LOCK_BTS 767
+LOCK_BTS_RM 768
+LOCK_DEC 769
+LOCK_INC 770
+LOCK_OR 771
+LOCK_PREFIX 772
+LOCK_SUB 773
+LOCK_XOR 774
+LODSB 775
+LODSL 776
+LODSQ 777
+LODSW 778
+LOOP 779
+LOOPE 780
+LOOPNE 781
+LRET 782
+LRETI 783
+LSL 784
+LSS 785
+LTRm 786
+LTRr 787
+LWPINS 788
+LWPVAL 789
+LXADD 790
+LZCNT 791
+MASKMOVDQU 792
+MASKPAIR 793
+MAXCPDrm 794
+MAXCPDrr 795
+MAXCPSrm 796
+MAXCPSrr 797
+MAXCSDrm 798
+MAXCSDrr 799
+MAXCSSrm 800
+MAXCSSrr 801
+MAXPDrm 802
+MAXPDrr 803
+MAXPSrm 804
+MAXPSrr 805
+MAXSDrm 806
+MAXSDrm_Int 807
+MAXSDrr 808
+MAXSDrr_Int 809
+MAXSSrm 810
+MAXSSrm_Int 811
+MAXSSrr 812
+MAXSSrr_Int 813
+MEMBARRIER 814
+MFENCE 815
+MINCPDrm 816
+MINCPDrr 817
+MINCPSrm 818
+MINCPSrr 819
+MINCSDrm 820
+MINCSDrr 821
+MINCSSrm 822
+MINCSSrr 823
+MINPDrm 824
+MINPDrr 825
+MINPSrm 826
+MINPSrr 827
+MINSDrm 828
+MINSDrm_Int 829
+MINSDrr 830
+MINSDrr_Int 831
+MINSSrm 832
+MINSSrm_Int 833
+MINSSrr 834
+MINSSrr_Int 835
+MMX_CVTPD 836
+MMX_CVTPI 837
+MMX_CVTPS 838
+MMX_CVTTPD 839
+MMX_CVTTPS 840
+MMX_EMMS 841
+MMX_MASKMOVQ 842
+MMX_MOVD 843
+MMX_MOVDQ 844
+MMX_MOVFR 845
+MMX_MOVNTQmr 846
+MMX_MOVQ 847
+MMX_PABSBrm 848
+MMX_PABSBrr 849
+MMX_PABSDrm 850
+MMX_PABSDrr 851
+MMX_PABSWrm 852
+MMX_PABSWrr 853
+MMX_PACKSSDWrm 854
+MMX_PACKSSDWrr 855
+MMX_PACKSSWBrm 856
+MMX_PACKSSWBrr 857
+MMX_PACKUSWBrm 858
+MMX_PACKUSWBrr 859
+MMX_PADDBrm 860
+MMX_PADDBrr 861
+MMX_PADDDrm 862
+MMX_PADDDrr 863
+MMX_PADDQrm 864
+MMX_PADDQrr 865
+MMX_PADDSBrm 866
+MMX_PADDSBrr 867
+MMX_PADDSWrm 868
+MMX_PADDSWrr 869
+MMX_PADDUSBrm 870
+MMX_PADDUSBrr 871
+MMX_PADDUSWrm 872
+MMX_PADDUSWrr 873
+MMX_PADDWrm 874
+MMX_PADDWrr 875
+MMX_PALIGNRrmi 876
+MMX_PALIGNRrri 877
+MMX_PANDNrm 878
+MMX_PANDNrr 879
+MMX_PANDrm 880
+MMX_PANDrr 881
+MMX_PAVGBrm 882
+MMX_PAVGBrr 883
+MMX_PAVGWrm 884
+MMX_PAVGWrr 885
+MMX_PCMPEQBrm 886
+MMX_PCMPEQBrr 887
+MMX_PCMPEQDrm 888
+MMX_PCMPEQDrr 889
+MMX_PCMPEQWrm 890
+MMX_PCMPEQWrr 891
+MMX_PCMPGTBrm 892
+MMX_PCMPGTBrr 893
+MMX_PCMPGTDrm 894
+MMX_PCMPGTDrr 895
+MMX_PCMPGTWrm 896
+MMX_PCMPGTWrr 897
+MMX_PEXTRWrri 898
+MMX_PHADDDrm 899
+MMX_PHADDDrr 900
+MMX_PHADDSWrm 901
+MMX_PHADDSWrr 902
+MMX_PHADDWrm 903
+MMX_PHADDWrr 904
+MMX_PHSUBDrm 905
+MMX_PHSUBDrr 906
+MMX_PHSUBSWrm 907
+MMX_PHSUBSWrr 908
+MMX_PHSUBWrm 909
+MMX_PHSUBWrr 910
+MMX_PINSRWrmi 911
+MMX_PINSRWrri 912
+MMX_PMADDUBSWrm 913
+MMX_PMADDUBSWrr 914
+MMX_PMADDWDrm 915
+MMX_PMADDWDrr 916
+MMX_PMAXSWrm 917
+MMX_PMAXSWrr 918
+MMX_PMAXUBrm 919
+MMX_PMAXUBrr 920
+MMX_PMINSWrm 921
+MMX_PMINSWrr 922
+MMX_PMINUBrm 923
+MMX_PMINUBrr 924
+MMX_PMOVMSKBrr 925
+MMX_PMULHRSWrm 926
+MMX_PMULHRSWrr 927
+MMX_PMULHUWrm 928
+MMX_PMULHUWrr 929
+MMX_PMULHWrm 930
+MMX_PMULHWrr 931
+MMX_PMULLWrm 932
+MMX_PMULLWrr 933
+MMX_PMULUDQrm 934
+MMX_PMULUDQrr 935
+MMX_PORrm 936
+MMX_PORrr 937
+MMX_PSADBWrm 938
+MMX_PSADBWrr 939
+MMX_PSHUFBrm 940
+MMX_PSHUFBrr 941
+MMX_PSHUFWmi 942
+MMX_PSHUFWri 943
+MMX_PSIGNBrm 944
+MMX_PSIGNBrr 945
+MMX_PSIGNDrm 946
+MMX_PSIGNDrr 947
+MMX_PSIGNWrm 948
+MMX_PSIGNWrr 949
+MMX_PSLLDri 950
+MMX_PSLLDrm 951
+MMX_PSLLDrr 952
+MMX_PSLLQri 953
+MMX_PSLLQrm 954
+MMX_PSLLQrr 955
+MMX_PSLLWri 956
+MMX_PSLLWrm 957
+MMX_PSLLWrr 958
+MMX_PSRADri 959
+MMX_PSRADrm 960
+MMX_PSRADrr 961
+MMX_PSRAWri 962
+MMX_PSRAWrm 963
+MMX_PSRAWrr 964
+MMX_PSRLDri 965
+MMX_PSRLDrm 966
+MMX_PSRLDrr 967
+MMX_PSRLQri 968
+MMX_PSRLQrm 969
+MMX_PSRLQrr 970
+MMX_PSRLWri 971
+MMX_PSRLWrm 972
+MMX_PSRLWrr 973
+MMX_PSUBBrm 974
+MMX_PSUBBrr 975
+MMX_PSUBDrm 976
+MMX_PSUBDrr 977
+MMX_PSUBQrm 978
+MMX_PSUBQrr 979
+MMX_PSUBSBrm 980
+MMX_PSUBSBrr 981
+MMX_PSUBSWrm 982
+MMX_PSUBSWrr 983
+MMX_PSUBUSBrm 984
+MMX_PSUBUSBrr 985
+MMX_PSUBUSWrm 986
+MMX_PSUBUSWrr 987
+MMX_PSUBWrm 988
+MMX_PSUBWrr 989
+MMX_PUNPCKHBWrm 990
+MMX_PUNPCKHBWrr 991
+MMX_PUNPCKHDQrm 992
+MMX_PUNPCKHDQrr 993
+MMX_PUNPCKHWDrm 994
+MMX_PUNPCKHWDrr 995
+MMX_PUNPCKLBWrm 996
+MMX_PUNPCKLBWrr 997
+MMX_PUNPCKLDQrm 998
+MMX_PUNPCKLDQrr 999
+MMX_PUNPCKLWDrm 1000
+MMX_PUNPCKLWDrr 1001
+MMX_PXORrm 1002
+MMX_PXORrr 1003
+MMX_SET 1004
+MONITOR 1005
+MONITORX 1006
+MONTMUL 1007
+MORESTACK_RET 1008
+MORESTACK_RET_RESTORE_R 1009
+MOV 1010
+MOVAPDmr 1011
+MOVAPDrm 1012
+MOVAPDrr 1013
+MOVAPDrr_REV 1014
+MOVAPSmr 1015
+MOVAPSrm 1016
+MOVAPSrr 1017
+MOVAPSrr_REV 1018
+MOVBE 1019
+MOVDDUPrm 1020
+MOVDDUPrr 1021
+MOVDI 1022
+MOVDIR 1023
+MOVDIRI 1024
+MOVDQAmr 1025
+MOVDQArm 1026
+MOVDQArr 1027
+MOVDQArr_REV 1028
+MOVDQUmr 1029
+MOVDQUrm 1030
+MOVDQUrr 1031
+MOVDQUrr_REV 1032
+MOVHLPSrr 1033
+MOVHPDmr 1034
+MOVHPDrm 1035
+MOVHPSmr 1036
+MOVHPSrm 1037
+MOVLHPSrr 1038
+MOVLPDmr 1039
+MOVLPDrm 1040
+MOVLPSmr 1041
+MOVLPSrm 1042
+MOVMSKPDrr 1043
+MOVMSKPSrr 1044
+MOVNTDQArm 1045
+MOVNTDQmr 1046
+MOVNTI 1047
+MOVNTImr 1048
+MOVNTPDmr 1049
+MOVNTPSmr 1050
+MOVNTSD 1051
+MOVNTSS 1052
+MOVPC 1053
+MOVPDI 1054
+MOVPQI 1055
+MOVPQIto 1056
+MOVQI 1057
+MOVRS 1058
+MOVSB 1059
+MOVSDmr 1060
+MOVSDrm 1061
+MOVSDrm_alt 1062
+MOVSDrr 1063
+MOVSDrr_REV 1064
+MOVSDto 1065
+MOVSHDUPrm 1066
+MOVSHDUPrr 1067
+MOVSHPmr 1068
+MOVSHPrm 1069
+MOVSL 1070
+MOVSLDUPrm 1071
+MOVSLDUPrr 1072
+MOVSQ 1073
+MOVSS 1074
+MOVSSmr 1075
+MOVSSrm 1076
+MOVSSrm_alt 1077
+MOVSSrr 1078
+MOVSSrr_REV 1079
+MOVSW 1080
+MOVSX 1081
+MOVUPDmr 1082
+MOVUPDrm 1083
+MOVUPDrr 1084
+MOVUPDrr_REV 1085
+MOVUPSmr 1086
+MOVUPSrm 1087
+MOVUPSrr 1088
+MOVUPSrr_REV 1089
+MOVZPQILo 1090
+MOVZX 1091
+MPSADBWrmi 1092
+MPSADBWrri 1093
+MUL 1094
+MULPDrm 1095
+MULPDrr 1096
+MULPSrm 1097
+MULPSrr 1098
+MULSDrm 1099
+MULSDrm_Int 1100
+MULSDrr 1101
+MULSDrr_Int 1102
+MULSSrm 1103
+MULSSrm_Int 1104
+MULSSrr 1105
+MULSSrr_Int 1106
+MULX 1107
+MUL_F 1108
+MUL_FI 1109
+MUL_FPrST 1110
+MUL_FST 1111
+MUL_Fp 1112
+MUL_FpI 1113
+MUL_FrST 1114
+MWAITX 1115
+MWAITX_SAVE_RBX 1116
+MWAITXrrr 1117
+MWAITrr 1118
+NEG 1119
+NOOP 1120
+NOOPL 1121
+NOOPLr 1122
+NOOPQ 1123
+NOOPQr 1124
+NOOPW 1125
+NOOPWr 1126
+NOT 1127
+OR 1128
+ORPDrm 1129
+ORPDrr 1130
+ORPSrm 1131
+ORPSrr 1132
+OUT 1133
+OUTSB 1134
+OUTSL 1135
+OUTSW 1136
+PABSBrm 1137
+PABSBrr 1138
+PABSDrm 1139
+PABSDrr 1140
+PABSWrm 1141
+PABSWrr 1142
+PACKSSDWrm 1143
+PACKSSDWrr 1144
+PACKSSWBrm 1145
+PACKSSWBrr 1146
+PACKUSDWrm 1147
+PACKUSDWrr 1148
+PACKUSWBrm 1149
+PACKUSWBrr 1150
+PADDBrm 1151
+PADDBrr 1152
+PADDDrm 1153
+PADDDrr 1154
+PADDQrm 1155
+PADDQrr 1156
+PADDSBrm 1157
+PADDSBrr 1158
+PADDSWrm 1159
+PADDSWrr 1160
+PADDUSBrm 1161
+PADDUSBrr 1162
+PADDUSWrm 1163
+PADDUSWrr 1164
+PADDWrm 1165
+PADDWrr 1166
+PALIGNRrmi 1167
+PALIGNRrri 1168
+PANDNrm 1169
+PANDNrr 1170
+PANDrm 1171
+PANDrr 1172
+PATCHABLE_EVENT_CALL 1173
+PATCHABLE_FUNCTION_ENTER 1174
+PATCHABLE_FUNCTION_EXIT 1175
+PATCHABLE_OP 1176
+PATCHABLE_RET 1177
+PATCHABLE_TAIL_CALL 1178
+PATCHABLE_TYPED_EVENT_CALL 1179
+PATCHPOINT 1180
+PAUSE 1181
+PAVGBrm 1182
+PAVGBrr 1183
+PAVGUSBrm 1184
+PAVGUSBrr 1185
+PAVGWrm 1186
+PAVGWrr 1187
+PBLENDVBrm 1188
+PBLENDVBrr 1189
+PBLENDWrmi 1190
+PBLENDWrri 1191
+PBNDKB 1192
+PCLMULQDQrmi 1193
+PCLMULQDQrri 1194
+PCMPEQBrm 1195
+PCMPEQBrr 1196
+PCMPEQDrm 1197
+PCMPEQDrr 1198
+PCMPEQQrm 1199
+PCMPEQQrr 1200
+PCMPEQWrm 1201
+PCMPEQWrr 1202
+PCMPESTRIrmi 1203
+PCMPESTRIrri 1204
+PCMPESTRMrmi 1205
+PCMPESTRMrri 1206
+PCMPGTBrm 1207
+PCMPGTBrr 1208
+PCMPGTDrm 1209
+PCMPGTDrr 1210
+PCMPGTQrm 1211
+PCMPGTQrr 1212
+PCMPGTWrm 1213
+PCMPGTWrr 1214
+PCMPISTRIrmi 1215
+PCMPISTRIrri 1216
+PCMPISTRMrmi 1217
+PCMPISTRMrri 1218
+PCONFIG 1219
+PDEP 1220
+PEXT 1221
+PEXTRBmri 1222
+PEXTRBrri 1223
+PEXTRDmri 1224
+PEXTRDrri 1225
+PEXTRQmri 1226
+PEXTRQrri 1227
+PEXTRWmri 1228
+PEXTRWrri 1229
+PEXTRWrri_REV 1230
+PF 1231
+PFACCrm 1232
+PFACCrr 1233
+PFADDrm 1234
+PFADDrr 1235
+PFCMPEQrm 1236
+PFCMPEQrr 1237
+PFCMPGErm 1238
+PFCMPGErr 1239
+PFCMPGTrm 1240
+PFCMPGTrr 1241
+PFMAXrm 1242
+PFMAXrr 1243
+PFMINrm 1244
+PFMINrr 1245
+PFMULrm 1246
+PFMULrr 1247
+PFNACCrm 1248
+PFNACCrr 1249
+PFPNACCrm 1250
+PFPNACCrr 1251
+PFRCPIT 1252
+PFRCPrm 1253
+PFRCPrr 1254
+PFRSQIT 1255
+PFRSQRTrm 1256
+PFRSQRTrr 1257
+PFSUBRrm 1258
+PFSUBRrr 1259
+PFSUBrm 1260
+PFSUBrr 1261
+PHADDDrm 1262
+PHADDDrr 1263
+PHADDSWrm 1264
+PHADDSWrr 1265
+PHADDWrm 1266
+PHADDWrr 1267
+PHI 1268
+PHMINPOSUWrm 1269
+PHMINPOSUWrr 1270
+PHSUBDrm 1271
+PHSUBDrr 1272
+PHSUBSWrm 1273
+PHSUBSWrr 1274
+PHSUBWrm 1275
+PHSUBWrr 1276
+PI 1277
+PINSRBrmi 1278
+PINSRBrri 1279
+PINSRDrmi 1280
+PINSRDrri 1281
+PINSRQrmi 1282
+PINSRQrri 1283
+PINSRWrmi 1284
+PINSRWrri 1285
+PLDTILECFGV 1286
+PLEA 1287
+PMADDUBSWrm 1288
+PMADDUBSWrr 1289
+PMADDWDrm 1290
+PMADDWDrr 1291
+PMAXSBrm 1292
+PMAXSBrr 1293
+PMAXSDrm 1294
+PMAXSDrr 1295
+PMAXSWrm 1296
+PMAXSWrr 1297
+PMAXUBrm 1298
+PMAXUBrr 1299
+PMAXUDrm 1300
+PMAXUDrr 1301
+PMAXUWrm 1302
+PMAXUWrr 1303
+PMINSBrm 1304
+PMINSBrr 1305
+PMINSDrm 1306
+PMINSDrr 1307
+PMINSWrm 1308
+PMINSWrr 1309
+PMINUBrm 1310
+PMINUBrr 1311
+PMINUDrm 1312
+PMINUDrr 1313
+PMINUWrm 1314
+PMINUWrr 1315
+PMOVMSKBrr 1316
+PMOVSXBDrm 1317
+PMOVSXBDrr 1318
+PMOVSXBQrm 1319
+PMOVSXBQrr 1320
+PMOVSXBWrm 1321
+PMOVSXBWrr 1322
+PMOVSXDQrm 1323
+PMOVSXDQrr 1324
+PMOVSXWDrm 1325
+PMOVSXWDrr 1326
+PMOVSXWQrm 1327
+PMOVSXWQrr 1328
+PMOVZXBDrm 1329
+PMOVZXBDrr 1330
+PMOVZXBQrm 1331
+PMOVZXBQrr 1332
+PMOVZXBWrm 1333
+PMOVZXBWrr 1334
+PMOVZXDQrm 1335
+PMOVZXDQrr 1336
+PMOVZXWDrm 1337
+PMOVZXWDrr 1338
+PMOVZXWQrm 1339
+PMOVZXWQrr 1340
+PMULDQrm 1341
+PMULDQrr 1342
+PMULHRSWrm 1343
+PMULHRSWrr 1344
+PMULHRWrm 1345
+PMULHRWrr 1346
+PMULHUWrm 1347
+PMULHUWrr 1348
+PMULHWrm 1349
+PMULHWrr 1350
+PMULLDrm 1351
+PMULLDrr 1352
+PMULLWrm 1353
+PMULLWrr 1354
+PMULUDQrm 1355
+PMULUDQrr 1356
+POP 1357
+POPA 1358
+POPCNT 1359
+POPDS 1360
+POPES 1361
+POPF 1362
+POPFS 1363
+POPGS 1364
+POPP 1365
+POPSS 1366
+PORrm 1367
+PORrr 1368
+PREALLOCATED_ARG 1369
+PREALLOCATED_SETUP 1370
+PREFETCH 1371
+PREFETCHIT 1372
+PREFETCHNTA 1373
+PREFETCHRST 1374
+PREFETCHT 1375
+PREFETCHW 1376
+PREFETCHWT 1377
+PROBED_ALLOCA 1378
+PSADBWrm 1379
+PSADBWrr 1380
+PSEUDO_PROBE 1381
+PSHUFBrm 1382
+PSHUFBrr 1383
+PSHUFDmi 1384
+PSHUFDri 1385
+PSHUFHWmi 1386
+PSHUFHWri 1387
+PSHUFLWmi 1388
+PSHUFLWri 1389
+PSIGNBrm 1390
+PSIGNBrr 1391
+PSIGNDrm 1392
+PSIGNDrr 1393
+PSIGNWrm 1394
+PSIGNWrr 1395
+PSLLDQri 1396
+PSLLDri 1397
+PSLLDrm 1398
+PSLLDrr 1399
+PSLLQri 1400
+PSLLQrm 1401
+PSLLQrr 1402
+PSLLWri 1403
+PSLLWrm 1404
+PSLLWrr 1405
+PSMASH 1406
+PSRADri 1407
+PSRADrm 1408
+PSRADrr 1409
+PSRAWri 1410
+PSRAWrm 1411
+PSRAWrr 1412
+PSRLDQri 1413
+PSRLDri 1414
+PSRLDrm 1415
+PSRLDrr 1416
+PSRLQri 1417
+PSRLQrm 1418
+PSRLQrr 1419
+PSRLWri 1420
+PSRLWrm 1421
+PSRLWrr 1422
+PSUBBrm 1423
+PSUBBrr 1424
+PSUBDrm 1425
+PSUBDrr 1426
+PSUBQrm 1427
+PSUBQrr 1428
+PSUBSBrm 1429
+PSUBSBrr 1430
+PSUBSWrm 1431
+PSUBSWrr 1432
+PSUBUSBrm 1433
+PSUBUSBrr 1434
+PSUBUSWrm 1435
+PSUBUSWrr 1436
+PSUBWrm 1437
+PSUBWrr 1438
+PSWAPDrm 1439
+PSWAPDrr 1440
+PT 1441
+PTCMMIMFP 1442
+PTCMMRLFP 1443
+PTCONJTCMMIMFP 1444
+PTCONJTFP 1445
+PTCVTROWD 1446
+PTCVTROWPS 1447
+PTDPBF 1448
+PTDPBHF 1449
+PTDPBSSD 1450
+PTDPBSSDV 1451
+PTDPBSUD 1452
+PTDPBSUDV 1453
+PTDPBUSD 1454
+PTDPBUSDV 1455
+PTDPBUUD 1456
+PTDPBUUDV 1457
+PTDPFP 1458
+PTDPHBF 1459
+PTDPHF 1460
+PTESTrm 1461
+PTESTrr 1462
+PTILELOADD 1463
+PTILELOADDRS 1464
+PTILELOADDRST 1465
+PTILELOADDRSV 1466
+PTILELOADDT 1467
+PTILELOADDV 1468
+PTILEMOVROWrre 1469
+PTILEMOVROWrreV 1470
+PTILEMOVROWrri 1471
+PTILEMOVROWrriV 1472
+PTILEPAIRLOAD 1473
+PTILEPAIRSTORE 1474
+PTILESTORED 1475
+PTILESTOREDV 1476
+PTILEZERO 1477
+PTILEZEROV 1478
+PTMMULTF 1479
+PTTCMMIMFP 1480
+PTTCMMRLFP 1481
+PTTDPBF 1482
+PTTDPFP 1483
+PTTMMULTF 1484
+PTTRANSPOSED 1485
+PTTRANSPOSEDV 1486
+PTWRITE 1487
+PTWRITEm 1488
+PTWRITEr 1489
+PUNPCKHBWrm 1490
+PUNPCKHBWrr 1491
+PUNPCKHDQrm 1492
+PUNPCKHDQrr 1493
+PUNPCKHQDQrm 1494
+PUNPCKHQDQrr 1495
+PUNPCKHWDrm 1496
+PUNPCKHWDrr 1497
+PUNPCKLBWrm 1498
+PUNPCKLBWrr 1499
+PUNPCKLDQrm 1500
+PUNPCKLDQrr 1501
+PUNPCKLQDQrm 1502
+PUNPCKLQDQrr 1503
+PUNPCKLWDrm 1504
+PUNPCKLWDrr 1505
+PUSH 1506
+PUSHA 1507
+PUSHCS 1508
+PUSHDS 1509
+PUSHES 1510
+PUSHF 1511
+PUSHFS 1512
+PUSHGS 1513
+PUSHP 1514
+PUSHSS 1515
+PVALIDATE 1516
+PXORrm 1517
+PXORrr 1518
+RCL 1519
+RCPPSm 1520
+RCPPSr 1521
+RCPSSm 1522
+RCPSSm_Int 1523
+RCPSSr 1524
+RCPSSr_Int 1525
+RCR 1526
+RDFLAGS 1527
+RDFSBASE 1528
+RDGSBASE 1529
+RDMSR 1530
+RDMSRLIST 1531
+RDMSRri 1532
+RDMSRri_EVEX 1533
+RDPID 1534
+RDPKRUr 1535
+RDPMC 1536
+RDPRU 1537
+RDRAND 1538
+RDSEED 1539
+RDSSPD 1540
+RDSSPQ 1541
+RDTSC 1542
+RDTSCP 1543
+REG_SEQUENCE 1544
+REPNE_PREFIX 1545
+REP_MOVSB 1546
+REP_MOVSD 1547
+REP_MOVSQ 1548
+REP_MOVSW 1549
+REP_PREFIX 1550
+REP_STOSB 1551
+REP_STOSD 1552
+REP_STOSQ 1553
+REP_STOSW 1554
+RET 1555
+RETI 1556
+REX 1557
+RMPADJUST 1558
+RMPQUERY 1559
+RMPUPDATE 1560
+ROL 1561
+ROR 1562
+RORX 1563
+ROUNDPDmi 1564
+ROUNDPDri 1565
+ROUNDPSmi 1566
+ROUNDPSri 1567
+ROUNDSDmi 1568
+ROUNDSDmi_Int 1569
+ROUNDSDri 1570
+ROUNDSDri_Int 1571
+ROUNDSSmi 1572
+ROUNDSSmi_Int 1573
+ROUNDSSri 1574
+ROUNDSSri_Int 1575
+RSM 1576
+RSQRTPSm 1577
+RSQRTPSr 1578
+RSQRTSSm 1579
+RSQRTSSm_Int 1580
+RSQRTSSr 1581
+RSQRTSSr_Int 1582
+RSTORSSP 1583
+SAHF 1584
+SALC 1585
+SAR 1586
+SARX 1587
+SAVEPREVSSP 1588
+SBB 1589
+SCASB 1590
+SCASL 1591
+SCASQ 1592
+SCASW 1593
+SEAMCALL 1594
+SEAMOPS 1595
+SEAMRET 1596
+SEG_ALLOCA 1597
+SEH_BeginEpilogue 1598
+SEH_EndEpilogue 1599
+SEH_EndPrologue 1600
+SEH_PushFrame 1601
+SEH_PushReg 1602
+SEH_SaveReg 1603
+SEH_SaveXMM 1604
+SEH_SetFrame 1605
+SEH_StackAlign 1606
+SEH_StackAlloc 1607
+SEH_UnwindV 1608
+SEH_UnwindVersion 1609
+SENDUIPI 1610
+SERIALIZE 1611
+SETB_C 1612
+SETCCm 1613
+SETCCm_EVEX 1614
+SETCCr 1615
+SETCCr_EVEX 1616
+SETSSBSY 1617
+SETZUCCm 1618
+SETZUCCr 1619
+SFENCE 1620
+SGDT 1621
+SHA 1622
+SHL 1623
+SHLD 1624
+SHLDROT 1625
+SHLX 1626
+SHR 1627
+SHRD 1628
+SHRDROT 1629
+SHRX 1630
+SHUFPDrmi 1631
+SHUFPDrri 1632
+SHUFPSrmi 1633
+SHUFPSrri 1634
+SIDT 1635
+SKINIT 1636
+SLDT 1637
+SLWPCB 1638
+SMSW 1639
+SQRTPDm 1640
+SQRTPDr 1641
+SQRTPSm 1642
+SQRTPSr 1643
+SQRTSDm 1644
+SQRTSDm_Int 1645
+SQRTSDr 1646
+SQRTSDr_Int 1647
+SQRTSSm 1648
+SQRTSSm_Int 1649
+SQRTSSr 1650
+SQRTSSr_Int 1651
+SQRT_F 1652
+SQRT_Fp 1653
+SS_PREFIX 1654
+STAC 1655
+STACKALLOC_W_PROBING 1656
+STACKMAP 1657
+STATEPOINT 1658
+STC 1659
+STD 1660
+STGI 1661
+STI 1662
+STMXCSR 1663
+STOSB 1664
+STOSL 1665
+STOSQ 1666
+STOSW 1667
+STR 1668
+STRm 1669
+STTILECFG 1670
+STTILECFG_EVEX 1671
+STUI 1672
+ST_F 1673
+ST_FP 1674
+ST_FPrr 1675
+ST_Fp 1676
+ST_FpP 1677
+ST_Frr 1678
+SUB 1679
+SUBPDrm 1680
+SUBPDrr 1681
+SUBPSrm 1682
+SUBPSrr 1683
+SUBREG_TO_REG 1684
+SUBR_F 1685
+SUBR_FI 1686
+SUBR_FPrST 1687
+SUBR_FST 1688
+SUBR_Fp 1689
+SUBR_FpI 1690
+SUBR_FrST 1691
+SUBSDrm 1692
+SUBSDrm_Int 1693
+SUBSDrr 1694
+SUBSDrr_Int 1695
+SUBSSrm 1696
+SUBSSrm_Int 1697
+SUBSSrr 1698
+SUBSSrr_Int 1699
+SUB_F 1700
+SUB_FI 1701
+SUB_FPrST 1702
+SUB_FST 1703
+SUB_Fp 1704
+SUB_FpI 1705
+SUB_FrST 1706
+SWAPGS 1707
+SYSCALL 1708
+SYSENTER 1709
+SYSEXIT 1710
+SYSRET 1711
+T 1712
+TAILJMPd 1713
+TAILJMPd_CC 1714
+TAILJMPm 1715
+TAILJMPr 1716
+TCMMIMFP 1717
+TCMMRLFP 1718
+TCONJTCMMIMFP 1719
+TCONJTFP 1720
+TCRETURN_HIPE 1721
+TCRETURN_WIN 1722
+TCRETURN_WINmi 1723
+TCRETURNdi 1724
+TCRETURNdicc 1725
+TCRETURNmi 1726
+TCRETURNri 1727
+TCVTROWD 1728
+TCVTROWPS 1729
+TDCALL 1730
+TDPBF 1731
+TDPBHF 1732
+TDPBSSD 1733
+TDPBSUD 1734
+TDPBUSD 1735
+TDPBUUD 1736
+TDPFP 1737
+TDPHBF 1738
+TDPHF 1739
+TEST 1740
+TESTUI 1741
+TILELOADD 1742
+TILELOADDRS 1743
+TILELOADDRST 1744
+TILELOADDRS_EVEX 1745
+TILELOADDT 1746
+TILELOADD_EVEX 1747
+TILEMOVROWrre 1748
+TILEMOVROWrri 1749
+TILERELEASE 1750
+TILESTORED 1751
+TILESTORED_EVEX 1752
+TILEZERO 1753
+TLBSYNC 1754
+TLSCall 1755
+TLS_addr 1756
+TLS_addrX 1757
+TLS_base_addr 1758
+TLS_base_addrX 1759
+TLS_desc 1760
+TMMULTF 1761
+TPAUSE 1762
+TRAP 1763
+TST_F 1764
+TST_Fp 1765
+TTCMMIMFP 1766
+TTCMMRLFP 1767
+TTDPBF 1768
+TTDPFP 1769
+TTMMULTF 1770
+TTRANSPOSED 1771
+TZCNT 1772
+TZMSK 1773
+UBSAN_UD 1774
+UCOMISDrm 1775
+UCOMISDrm_Int 1776
+UCOMISDrr 1777
+UCOMISDrr_Int 1778
+UCOMISSrm 1779
+UCOMISSrm_Int 1780
+UCOMISSrr 1781
+UCOMISSrr_Int 1782
+UCOM_FIPr 1783
+UCOM_FIr 1784
+UCOM_FPPr 1785
+UCOM_FPr 1786
+UCOM_FpIr 1787
+UCOM_Fpr 1788
+UCOM_Fr 1789
+UD 1790
+UIRET 1791
+UMONITOR 1792
+UMWAIT 1793
+UNPCKHPDrm 1794
+UNPCKHPDrr 1795
+UNPCKHPSrm 1796
+UNPCKHPSrr 1797
+UNPCKLPDrm 1798
+UNPCKLPDrr 1799
+UNPCKLPSrm 1800
+UNPCKLPSrr 1801
+URDMSRri 1802
+URDMSRri_EVEX 1803
+URDMSRrr 1804
+URDMSRrr_EVEX 1805
+UWRMSRir 1806
+UWRMSRir_EVEX 1807
+UWRMSRrr 1808
+UWRMSRrr_EVEX 1809
+V 1810
+VAARG 1811
+VAARG_X 1812
+VADDBF 1813
+VADDPDYrm 1814
+VADDPDYrr 1815
+VADDPDZ 1816
+VADDPDZrm 1817
+VADDPDZrmb 1818
+VADDPDZrmbk 1819
+VADDPDZrmbkz 1820
+VADDPDZrmk 1821
+VADDPDZrmkz 1822
+VADDPDZrr 1823
+VADDPDZrrb 1824
+VADDPDZrrbk 1825
+VADDPDZrrbkz 1826
+VADDPDZrrk 1827
+VADDPDZrrkz 1828
+VADDPDrm 1829
+VADDPDrr 1830
+VADDPHZ 1831
+VADDPHZrm 1832
+VADDPHZrmb 1833
+VADDPHZrmbk 1834
+VADDPHZrmbkz 1835
+VADDPHZrmk 1836
+VADDPHZrmkz 1837
+VADDPHZrr 1838
+VADDPHZrrb 1839
+VADDPHZrrbk 1840
+VADDPHZrrbkz 1841
+VADDPHZrrk 1842
+VADDPHZrrkz 1843
+VADDPSYrm 1844
+VADDPSYrr 1845
+VADDPSZ 1846
+VADDPSZrm 1847
+VADDPSZrmb 1848
+VADDPSZrmbk 1849
+VADDPSZrmbkz 1850
+VADDPSZrmk 1851
+VADDPSZrmkz 1852
+VADDPSZrr 1853
+VADDPSZrrb 1854
+VADDPSZrrbk 1855
+VADDPSZrrbkz 1856
+VADDPSZrrk 1857
+VADDPSZrrkz 1858
+VADDPSrm 1859
+VADDPSrr 1860
+VADDSDZrm 1861
+VADDSDZrm_Int 1862
+VADDSDZrmk_Int 1863
+VADDSDZrmkz_Int 1864
+VADDSDZrr 1865
+VADDSDZrr_Int 1866
+VADDSDZrrb_Int 1867
+VADDSDZrrbk_Int 1868
+VADDSDZrrbkz_Int 1869
+VADDSDZrrk_Int 1870
+VADDSDZrrkz_Int 1871
+VADDSDrm 1872
+VADDSDrm_Int 1873
+VADDSDrr 1874
+VADDSDrr_Int 1875
+VADDSHZrm 1876
+VADDSHZrm_Int 1877
+VADDSHZrmk_Int 1878
+VADDSHZrmkz_Int 1879
+VADDSHZrr 1880
+VADDSHZrr_Int 1881
+VADDSHZrrb_Int 1882
+VADDSHZrrbk_Int 1883
+VADDSHZrrbkz_Int 1884
+VADDSHZrrk_Int 1885
+VADDSHZrrkz_Int 1886
+VADDSSZrm 1887
+VADDSSZrm_Int 1888
+VADDSSZrmk_Int 1889
+VADDSSZrmkz_Int 1890
+VADDSSZrr 1891
+VADDSSZrr_Int 1892
+VADDSSZrrb_Int 1893
+VADDSSZrrbk_Int 1894
+VADDSSZrrbkz_Int 1895
+VADDSSZrrk_Int 1896
+VADDSSZrrkz_Int 1897
+VADDSSrm 1898
+VADDSSrm_Int 1899
+VADDSSrr 1900
+VADDSSrr_Int 1901
+VADDSUBPDYrm 1902
+VADDSUBPDYrr 1903
+VADDSUBPDrm 1904
+VADDSUBPDrr 1905
+VADDSUBPSYrm 1906
+VADDSUBPSYrr 1907
+VADDSUBPSrm 1908
+VADDSUBPSrr 1909
+VAESDECLASTYrm 1910
+VAESDECLASTYrr 1911
+VAESDECLASTZ 1912
+VAESDECLASTZrm 1913
+VAESDECLASTZrr 1914
+VAESDECLASTrm 1915
+VAESDECLASTrr 1916
+VAESDECYrm 1917
+VAESDECYrr 1918
+VAESDECZ 1919
+VAESDECZrm 1920
+VAESDECZrr 1921
+VAESDECrm 1922
+VAESDECrr 1923
+VAESENCLASTYrm 1924
+VAESENCLASTYrr 1925
+VAESENCLASTZ 1926
+VAESENCLASTZrm 1927
+VAESENCLASTZrr 1928
+VAESENCLASTrm 1929
+VAESENCLASTrr 1930
+VAESENCYrm 1931
+VAESENCYrr 1932
+VAESENCZ 1933
+VAESENCZrm 1934
+VAESENCZrr 1935
+VAESENCrm 1936
+VAESENCrr 1937
+VAESIMCrm 1938
+VAESIMCrr 1939
+VAESKEYGENASSISTrmi 1940
+VAESKEYGENASSISTrri 1941
+VALIGNDZ 1942
+VALIGNDZrmbi 1943
+VALIGNDZrmbik 1944
+VALIGNDZrmbikz 1945
+VALIGNDZrmi 1946
+VALIGNDZrmik 1947
+VALIGNDZrmikz 1948
+VALIGNDZrri 1949
+VALIGNDZrrik 1950
+VALIGNDZrrikz 1951
+VALIGNQZ 1952
+VALIGNQZrmbi 1953
+VALIGNQZrmbik 1954
+VALIGNQZrmbikz 1955
+VALIGNQZrmi 1956
+VALIGNQZrmik 1957
+VALIGNQZrmikz 1958
+VALIGNQZrri 1959
+VALIGNQZrrik 1960
+VALIGNQZrrikz 1961
+VANDNPDYrm 1962
+VANDNPDYrr 1963
+VANDNPDZ 1964
+VANDNPDZrm 1965
+VANDNPDZrmb 1966
+VANDNPDZrmbk 1967
+VANDNPDZrmbkz 1968
+VANDNPDZrmk 1969
+VANDNPDZrmkz 1970
+VANDNPDZrr 1971
+VANDNPDZrrk 1972
+VANDNPDZrrkz 1973
+VANDNPDrm 1974
+VANDNPDrr 1975
+VANDNPSYrm 1976
+VANDNPSYrr 1977
+VANDNPSZ 1978
+VANDNPSZrm 1979
+VANDNPSZrmb 1980
+VANDNPSZrmbk 1981
+VANDNPSZrmbkz 1982
+VANDNPSZrmk 1983
+VANDNPSZrmkz 1984
+VANDNPSZrr 1985
+VANDNPSZrrk 1986
+VANDNPSZrrkz 1987
+VANDNPSrm 1988
+VANDNPSrr 1989
+VANDPDYrm 1990
+VANDPDYrr 1991
+VANDPDZ 1992
+VANDPDZrm 1993
+VANDPDZrmb 1994
+VANDPDZrmbk 1995
+VANDPDZrmbkz 1996
+VANDPDZrmk 1997
+VANDPDZrmkz 1998
+VANDPDZrr 1999
+VANDPDZrrk 2000
+VANDPDZrrkz 2001
+VANDPDrm 2002
+VANDPDrr 2003
+VANDPSYrm 2004
+VANDPSYrr 2005
+VANDPSZ 2006
+VANDPSZrm 2007
+VANDPSZrmb 2008
+VANDPSZrmbk 2009
+VANDPSZrmbkz 2010
+VANDPSZrmk 2011
+VANDPSZrmkz 2012
+VANDPSZrr 2013
+VANDPSZrrk 2014
+VANDPSZrrkz 2015
+VANDPSrm 2016
+VANDPSrr 2017
+VASTART_SAVE_XMM_REGS 2018
+VBCSTNEBF 2019
+VBCSTNESH 2020
+VBLENDMPDZ 2021
+VBLENDMPDZrm 2022
+VBLENDMPDZrmb 2023
+VBLENDMPDZrmbk 2024
+VBLENDMPDZrmbkz 2025
+VBLENDMPDZrmk 2026
+VBLENDMPDZrmkz 2027
+VBLENDMPDZrr 2028
+VBLENDMPDZrrk 2029
+VBLENDMPDZrrkz 2030
+VBLENDMPSZ 2031
+VBLENDMPSZrm 2032
+VBLENDMPSZrmb 2033
+VBLENDMPSZrmbk 2034
+VBLENDMPSZrmbkz 2035
+VBLENDMPSZrmk 2036
+VBLENDMPSZrmkz 2037
+VBLENDMPSZrr 2038
+VBLENDMPSZrrk 2039
+VBLENDMPSZrrkz 2040
+VBLENDPDYrmi 2041
+VBLENDPDYrri 2042
+VBLENDPDrmi 2043
+VBLENDPDrri 2044
+VBLENDPSYrmi 2045
+VBLENDPSYrri 2046
+VBLENDPSrmi 2047
+VBLENDPSrri 2048
+VBLENDVPDYrmr 2049
+VBLENDVPDYrrr 2050
+VBLENDVPDrmr 2051
+VBLENDVPDrrr 2052
+VBLENDVPSYrmr 2053
+VBLENDVPSYrrr 2054
+VBLENDVPSrmr 2055
+VBLENDVPSrrr 2056
+VBROADCASTF 2057
+VBROADCASTI 2058
+VBROADCASTSDYrm 2059
+VBROADCASTSDYrr 2060
+VBROADCASTSDZ 2061
+VBROADCASTSDZrm 2062
+VBROADCASTSDZrmk 2063
+VBROADCASTSDZrmkz 2064
+VBROADCASTSDZrr 2065
+VBROADCASTSDZrrk 2066
+VBROADCASTSDZrrkz 2067
+VBROADCASTSSYrm 2068
+VBROADCASTSSYrr 2069
+VBROADCASTSSZ 2070
+VBROADCASTSSZrm 2071
+VBROADCASTSSZrmk 2072
+VBROADCASTSSZrmkz 2073
+VBROADCASTSSZrr 2074
+VBROADCASTSSZrrk 2075
+VBROADCASTSSZrrkz 2076
+VBROADCASTSSrm 2077
+VBROADCASTSSrr 2078
+VCMPBF 2079
+VCMPPDYrmi 2080
+VCMPPDYrri 2081
+VCMPPDZ 2082
+VCMPPDZrmbi 2083
+VCMPPDZrmbik 2084
+VCMPPDZrmi 2085
+VCMPPDZrmik 2086
+VCMPPDZrri 2087
+VCMPPDZrrib 2088
+VCMPPDZrribk 2089
+VCMPPDZrrik 2090
+VCMPPDrmi 2091
+VCMPPDrri 2092
+VCMPPHZ 2093
+VCMPPHZrmbi 2094
+VCMPPHZrmbik 2095
+VCMPPHZrmi 2096
+VCMPPHZrmik 2097
+VCMPPHZrri 2098
+VCMPPHZrrib 2099
+VCMPPHZrribk 2100
+VCMPPHZrrik 2101
+VCMPPSYrmi 2102
+VCMPPSYrri 2103
+VCMPPSZ 2104
+VCMPPSZrmbi 2105
+VCMPPSZrmbik 2106
+VCMPPSZrmi 2107
+VCMPPSZrmik 2108
+VCMPPSZrri 2109
+VCMPPSZrrib 2110
+VCMPPSZrribk 2111
+VCMPPSZrrik 2112
+VCMPPSrmi 2113
+VCMPPSrri 2114
+VCMPSDZrmi 2115
+VCMPSDZrmi_Int 2116
+VCMPSDZrmik_Int 2117
+VCMPSDZrri 2118
+VCMPSDZrri_Int 2119
+VCMPSDZrrib_Int 2120
+VCMPSDZrribk_Int 2121
+VCMPSDZrrik_Int 2122
+VCMPSDrmi 2123
+VCMPSDrmi_Int 2124
+VCMPSDrri 2125
+VCMPSDrri_Int 2126
+VCMPSHZrmi 2127
+VCMPSHZrmi_Int 2128
+VCMPSHZrmik_Int 2129
+VCMPSHZrri 2130
+VCMPSHZrri_Int 2131
+VCMPSHZrrib_Int 2132
+VCMPSHZrribk_Int 2133
+VCMPSHZrrik_Int 2134
+VCMPSSZrmi 2135
+VCMPSSZrmi_Int 2136
+VCMPSSZrmik_Int 2137
+VCMPSSZrri 2138
+VCMPSSZrri_Int 2139
+VCMPSSZrrib_Int 2140
+VCMPSSZrribk_Int 2141
+VCMPSSZrrik_Int 2142
+VCMPSSrmi 2143
+VCMPSSrmi_Int 2144
+VCMPSSrri 2145
+VCMPSSrri_Int 2146
+VCOMISBF 2147
+VCOMISDZrm 2148
+VCOMISDZrm_Int 2149
+VCOMISDZrr 2150
+VCOMISDZrr_Int 2151
+VCOMISDZrrb 2152
+VCOMISDrm 2153
+VCOMISDrm_Int 2154
+VCOMISDrr 2155
+VCOMISDrr_Int 2156
+VCOMISHZrm 2157
+VCOMISHZrm_Int 2158
+VCOMISHZrr 2159
+VCOMISHZrr_Int 2160
+VCOMISHZrrb 2161
+VCOMISSZrm 2162
+VCOMISSZrm_Int 2163
+VCOMISSZrr 2164
+VCOMISSZrr_Int 2165
+VCOMISSZrrb 2166
+VCOMISSrm 2167
+VCOMISSrm_Int 2168
+VCOMISSrr 2169
+VCOMISSrr_Int 2170
+VCOMPRESSPDZ 2171
+VCOMPRESSPDZmr 2172
+VCOMPRESSPDZmrk 2173
+VCOMPRESSPDZrr 2174
+VCOMPRESSPDZrrk 2175
+VCOMPRESSPDZrrkz 2176
+VCOMPRESSPSZ 2177
+VCOMPRESSPSZmr 2178
+VCOMPRESSPSZmrk 2179
+VCOMPRESSPSZrr 2180
+VCOMPRESSPSZrrk 2181
+VCOMPRESSPSZrrkz 2182
+VCOMXSDZrm_Int 2183
+VCOMXSDZrr_Int 2184
+VCOMXSDZrrb_Int 2185
+VCOMXSHZrm_Int 2186
+VCOMXSHZrr_Int 2187
+VCOMXSHZrrb_Int 2188
+VCOMXSSZrm_Int 2189
+VCOMXSSZrr_Int 2190
+VCOMXSSZrrb_Int 2191
+VCVT 2192
+VCVTBF 2193
+VCVTBIASPH 2194
+VCVTDQ 2195
+VCVTHF 2196
+VCVTNE 2197
+VCVTNEEBF 2198
+VCVTNEEPH 2199
+VCVTNEOBF 2200
+VCVTNEOPH 2201
+VCVTNEPS 2202
+VCVTPD 2203
+VCVTPH 2204
+VCVTPS 2205
+VCVTQQ 2206
+VCVTSD 2207
+VCVTSH 2208
+VCVTSI 2209
+VCVTSS 2210
+VCVTTBF 2211
+VCVTTPD 2212
+VCVTTPH 2213
+VCVTTPS 2214
+VCVTTSD 2215
+VCVTTSH 2216
+VCVTTSS 2217
+VCVTUDQ 2218
+VCVTUQQ 2219
+VCVTUSI 2220
+VCVTUW 2221
+VCVTW 2222
+VDBPSADBWZ 2223
+VDBPSADBWZrmi 2224
+VDBPSADBWZrmik 2225
+VDBPSADBWZrmikz 2226
+VDBPSADBWZrri 2227
+VDBPSADBWZrrik 2228
+VDBPSADBWZrrikz 2229
+VDIVBF 2230
+VDIVPDYrm 2231
+VDIVPDYrr 2232
+VDIVPDZ 2233
+VDIVPDZrm 2234
+VDIVPDZrmb 2235
+VDIVPDZrmbk 2236
+VDIVPDZrmbkz 2237
+VDIVPDZrmk 2238
+VDIVPDZrmkz 2239
+VDIVPDZrr 2240
+VDIVPDZrrb 2241
+VDIVPDZrrbk 2242
+VDIVPDZrrbkz 2243
+VDIVPDZrrk 2244
+VDIVPDZrrkz 2245
+VDIVPDrm 2246
+VDIVPDrr 2247
+VDIVPHZ 2248
+VDIVPHZrm 2249
+VDIVPHZrmb 2250
+VDIVPHZrmbk 2251
+VDIVPHZrmbkz 2252
+VDIVPHZrmk 2253
+VDIVPHZrmkz 2254
+VDIVPHZrr 2255
+VDIVPHZrrb 2256
+VDIVPHZrrbk 2257
+VDIVPHZrrbkz 2258
+VDIVPHZrrk 2259
+VDIVPHZrrkz 2260
+VDIVPSYrm 2261
+VDIVPSYrr 2262
+VDIVPSZ 2263
+VDIVPSZrm 2264
+VDIVPSZrmb 2265
+VDIVPSZrmbk 2266
+VDIVPSZrmbkz 2267
+VDIVPSZrmk 2268
+VDIVPSZrmkz 2269
+VDIVPSZrr 2270
+VDIVPSZrrb 2271
+VDIVPSZrrbk 2272
+VDIVPSZrrbkz 2273
+VDIVPSZrrk 2274
+VDIVPSZrrkz 2275
+VDIVPSrm 2276
+VDIVPSrr 2277
+VDIVSDZrm 2278
+VDIVSDZrm_Int 2279
+VDIVSDZrmk_Int 2280
+VDIVSDZrmkz_Int 2281
+VDIVSDZrr 2282
+VDIVSDZrr_Int 2283
+VDIVSDZrrb_Int 2284
+VDIVSDZrrbk_Int 2285
+VDIVSDZrrbkz_Int 2286
+VDIVSDZrrk_Int 2287
+VDIVSDZrrkz_Int 2288
+VDIVSDrm 2289
+VDIVSDrm_Int 2290
+VDIVSDrr 2291
+VDIVSDrr_Int 2292
+VDIVSHZrm 2293
+VDIVSHZrm_Int 2294
+VDIVSHZrmk_Int 2295
+VDIVSHZrmkz_Int 2296
+VDIVSHZrr 2297
+VDIVSHZrr_Int 2298
+VDIVSHZrrb_Int 2299
+VDIVSHZrrbk_Int 2300
+VDIVSHZrrbkz_Int 2301
+VDIVSHZrrk_Int 2302
+VDIVSHZrrkz_Int 2303
+VDIVSSZrm 2304
+VDIVSSZrm_Int 2305
+VDIVSSZrmk_Int 2306
+VDIVSSZrmkz_Int 2307
+VDIVSSZrr 2308
+VDIVSSZrr_Int 2309
+VDIVSSZrrb_Int 2310
+VDIVSSZrrbk_Int 2311
+VDIVSSZrrbkz_Int 2312
+VDIVSSZrrk_Int 2313
+VDIVSSZrrkz_Int 2314
+VDIVSSrm 2315
+VDIVSSrm_Int 2316
+VDIVSSrr 2317
+VDIVSSrr_Int 2318
+VDPBF 2319
+VDPPDrmi 2320
+VDPPDrri 2321
+VDPPHPSZ 2322
+VDPPHPSZm 2323
+VDPPHPSZmb 2324
+VDPPHPSZmbk 2325
+VDPPHPSZmbkz 2326
+VDPPHPSZmk 2327
+VDPPHPSZmkz 2328
+VDPPHPSZr 2329
+VDPPHPSZrk 2330
+VDPPHPSZrkz 2331
+VDPPSYrmi 2332
+VDPPSYrri 2333
+VDPPSrmi 2334
+VDPPSrri 2335
+VERRm 2336
+VERRr 2337
+VERWm 2338
+VERWr 2339
+VEXP 2340
+VEXPANDPDZ 2341
+VEXPANDPDZrm 2342
+VEXPANDPDZrmk 2343
+VEXPANDPDZrmkz 2344
+VEXPANDPDZrr 2345
+VEXPANDPDZrrk 2346
+VEXPANDPDZrrkz 2347
+VEXPANDPSZ 2348
+VEXPANDPSZrm 2349
+VEXPANDPSZrmk 2350
+VEXPANDPSZrmkz 2351
+VEXPANDPSZrr 2352
+VEXPANDPSZrrk 2353
+VEXPANDPSZrrkz 2354
+VEXTRACTF 2355
+VEXTRACTI 2356
+VEXTRACTPSZmri 2357
+VEXTRACTPSZrri 2358
+VEXTRACTPSmri 2359
+VEXTRACTPSrri 2360
+VFCMADDCPHZ 2361
+VFCMADDCPHZm 2362
+VFCMADDCPHZmb 2363
+VFCMADDCPHZmbk 2364
+VFCMADDCPHZmbkz 2365
+VFCMADDCPHZmk 2366
+VFCMADDCPHZmkz 2367
+VFCMADDCPHZr 2368
+VFCMADDCPHZrb 2369
+VFCMADDCPHZrbk 2370
+VFCMADDCPHZrbkz 2371
+VFCMADDCPHZrk 2372
+VFCMADDCPHZrkz 2373
+VFCMADDCSHZm 2374
+VFCMADDCSHZmk 2375
+VFCMADDCSHZmkz 2376
+VFCMADDCSHZr 2377
+VFCMADDCSHZrb 2378
+VFCMADDCSHZrbk 2379
+VFCMADDCSHZrbkz 2380
+VFCMADDCSHZrk 2381
+VFCMADDCSHZrkz 2382
+VFCMULCPHZ 2383
+VFCMULCPHZrm 2384
+VFCMULCPHZrmb 2385
+VFCMULCPHZrmbk 2386
+VFCMULCPHZrmbkz 2387
+VFCMULCPHZrmk 2388
+VFCMULCPHZrmkz 2389
+VFCMULCPHZrr 2390
+VFCMULCPHZrrb 2391
+VFCMULCPHZrrbk 2392
+VFCMULCPHZrrbkz 2393
+VFCMULCPHZrrk 2394
+VFCMULCPHZrrkz 2395
+VFCMULCSHZrm 2396
+VFCMULCSHZrmk 2397
+VFCMULCSHZrmkz 2398
+VFCMULCSHZrr 2399
+VFCMULCSHZrrb 2400
+VFCMULCSHZrrbk 2401
+VFCMULCSHZrrbkz 2402
+VFCMULCSHZrrk 2403
+VFCMULCSHZrrkz 2404
+VFIXUPIMMPDZ 2405
+VFIXUPIMMPDZrmbi 2406
+VFIXUPIMMPDZrmbik 2407
+VFIXUPIMMPDZrmbikz 2408
+VFIXUPIMMPDZrmi 2409
+VFIXUPIMMPDZrmik 2410
+VFIXUPIMMPDZrmikz 2411
+VFIXUPIMMPDZrri 2412
+VFIXUPIMMPDZrrib 2413
+VFIXUPIMMPDZrribk 2414
+VFIXUPIMMPDZrribkz 2415
+VFIXUPIMMPDZrrik 2416
+VFIXUPIMMPDZrrikz 2417
+VFIXUPIMMPSZ 2418
+VFIXUPIMMPSZrmbi 2419
+VFIXUPIMMPSZrmbik 2420
+VFIXUPIMMPSZrmbikz 2421
+VFIXUPIMMPSZrmi 2422
+VFIXUPIMMPSZrmik 2423
+VFIXUPIMMPSZrmikz 2424
+VFIXUPIMMPSZrri 2425
+VFIXUPIMMPSZrrib 2426
+VFIXUPIMMPSZrribk 2427
+VFIXUPIMMPSZrribkz 2428
+VFIXUPIMMPSZrrik 2429
+VFIXUPIMMPSZrrikz 2430
+VFIXUPIMMSDZrmi 2431
+VFIXUPIMMSDZrmik 2432
+VFIXUPIMMSDZrmikz 2433
+VFIXUPIMMSDZrri 2434
+VFIXUPIMMSDZrrib 2435
+VFIXUPIMMSDZrribk 2436
+VFIXUPIMMSDZrribkz 2437
+VFIXUPIMMSDZrrik 2438
+VFIXUPIMMSDZrrikz 2439
+VFIXUPIMMSSZrmi 2440
+VFIXUPIMMSSZrmik 2441
+VFIXUPIMMSSZrmikz 2442
+VFIXUPIMMSSZrri 2443
+VFIXUPIMMSSZrrib 2444
+VFIXUPIMMSSZrribk 2445
+VFIXUPIMMSSZrribkz 2446
+VFIXUPIMMSSZrrik 2447
+VFIXUPIMMSSZrrikz 2448
+VFMADD 2449
+VFMADDCPHZ 2450
+VFMADDCPHZm 2451
+VFMADDCPHZmb 2452
+VFMADDCPHZmbk 2453
+VFMADDCPHZmbkz 2454
+VFMADDCPHZmk 2455
+VFMADDCPHZmkz 2456
+VFMADDCPHZr 2457
+VFMADDCPHZrb 2458
+VFMADDCPHZrbk 2459
+VFMADDCPHZrbkz 2460
+VFMADDCPHZrk 2461
+VFMADDCPHZrkz 2462
+VFMADDCSHZm 2463
+VFMADDCSHZmk 2464
+VFMADDCSHZmkz 2465
+VFMADDCSHZr 2466
+VFMADDCSHZrb 2467
+VFMADDCSHZrbk 2468
+VFMADDCSHZrbkz 2469
+VFMADDCSHZrk 2470
+VFMADDCSHZrkz 2471
+VFMADDPD 2472
+VFMADDPS 2473
+VFMADDSD 2474
+VFMADDSS 2475
+VFMADDSUB 2476
+VFMADDSUBPD 2477
+VFMADDSUBPS 2478
+VFMSUB 2479
+VFMSUBADD 2480
+VFMSUBADDPD 2481
+VFMSUBADDPS 2482
+VFMSUBPD 2483
+VFMSUBPS 2484
+VFMSUBSD 2485
+VFMSUBSS 2486
+VFMULCPHZ 2487
+VFMULCPHZrm 2488
+VFMULCPHZrmb 2489
+VFMULCPHZrmbk 2490
+VFMULCPHZrmbkz 2491
+VFMULCPHZrmk 2492
+VFMULCPHZrmkz 2493
+VFMULCPHZrr 2494
+VFMULCPHZrrb 2495
+VFMULCPHZrrbk 2496
+VFMULCPHZrrbkz 2497
+VFMULCPHZrrk 2498
+VFMULCPHZrrkz 2499
+VFMULCSHZrm 2500
+VFMULCSHZrmk 2501
+VFMULCSHZrmkz 2502
+VFMULCSHZrr 2503
+VFMULCSHZrrb 2504
+VFMULCSHZrrbk 2505
+VFMULCSHZrrbkz 2506
+VFMULCSHZrrk 2507
+VFMULCSHZrrkz 2508
+VFNMADD 2509
+VFNMADDPD 2510
+VFNMADDPS 2511
+VFNMADDSD 2512
+VFNMADDSS 2513
+VFNMSUB 2514
+VFNMSUBPD 2515
+VFNMSUBPS 2516
+VFNMSUBSD 2517
+VFNMSUBSS 2518
+VFPCLASSBF 2519
+VFPCLASSPDZ 2520
+VFPCLASSPDZmbi 2521
+VFPCLASSPDZmbik 2522
+VFPCLASSPDZmi 2523
+VFPCLASSPDZmik 2524
+VFPCLASSPDZri 2525
+VFPCLASSPDZrik 2526
+VFPCLASSPHZ 2527
+VFPCLASSPHZmbi 2528
+VFPCLASSPHZmbik 2529
+VFPCLASSPHZmi 2530
+VFPCLASSPHZmik 2531
+VFPCLASSPHZri 2532
+VFPCLASSPHZrik 2533
+VFPCLASSPSZ 2534
+VFPCLASSPSZmbi 2535
+VFPCLASSPSZmbik 2536
+VFPCLASSPSZmi 2537
+VFPCLASSPSZmik 2538
+VFPCLASSPSZri 2539
+VFPCLASSPSZrik 2540
+VFPCLASSSDZmi 2541
+VFPCLASSSDZmik 2542
+VFPCLASSSDZri 2543
+VFPCLASSSDZrik 2544
+VFPCLASSSHZmi 2545
+VFPCLASSSHZmik 2546
+VFPCLASSSHZri 2547
+VFPCLASSSHZrik 2548
+VFPCLASSSSZmi 2549
+VFPCLASSSSZmik 2550
+VFPCLASSSSZri 2551
+VFPCLASSSSZrik 2552
+VFRCZPDYrm 2553
+VFRCZPDYrr 2554
+VFRCZPDrm 2555
+VFRCZPDrr 2556
+VFRCZPSYrm 2557
+VFRCZPSYrr 2558
+VFRCZPSrm 2559
+VFRCZPSrr 2560
+VFRCZSDrm 2561
+VFRCZSDrr 2562
+VFRCZSSrm 2563
+VFRCZSSrr 2564
+VGATHERDPDYrm 2565
+VGATHERDPDZ 2566
+VGATHERDPDZrm 2567
+VGATHERDPDrm 2568
+VGATHERDPSYrm 2569
+VGATHERDPSZ 2570
+VGATHERDPSZrm 2571
+VGATHERDPSrm 2572
+VGATHERPF 2573
+VGATHERQPDYrm 2574
+VGATHERQPDZ 2575
+VGATHERQPDZrm 2576
+VGATHERQPDrm 2577
+VGATHERQPSYrm 2578
+VGATHERQPSZ 2579
+VGATHERQPSZrm 2580
+VGATHERQPSrm 2581
+VGETEXPBF 2582
+VGETEXPPDZ 2583
+VGETEXPPDZm 2584
+VGETEXPPDZmb 2585
+VGETEXPPDZmbk 2586
+VGETEXPPDZmbkz 2587
+VGETEXPPDZmk 2588
+VGETEXPPDZmkz 2589
+VGETEXPPDZr 2590
+VGETEXPPDZrb 2591
+VGETEXPPDZrbk 2592
+VGETEXPPDZrbkz 2593
+VGETEXPPDZrk 2594
+VGETEXPPDZrkz 2595
+VGETEXPPHZ 2596
+VGETEXPPHZm 2597
+VGETEXPPHZmb 2598
+VGETEXPPHZmbk 2599
+VGETEXPPHZmbkz 2600
+VGETEXPPHZmk 2601
+VGETEXPPHZmkz 2602
+VGETEXPPHZr 2603
+VGETEXPPHZrb 2604
+VGETEXPPHZrbk 2605
+VGETEXPPHZrbkz 2606
+VGETEXPPHZrk 2607
+VGETEXPPHZrkz 2608
+VGETEXPPSZ 2609
+VGETEXPPSZm 2610
+VGETEXPPSZmb 2611
+VGETEXPPSZmbk 2612
+VGETEXPPSZmbkz 2613
+VGETEXPPSZmk 2614
+VGETEXPPSZmkz 2615
+VGETEXPPSZr 2616
+VGETEXPPSZrb 2617
+VGETEXPPSZrbk 2618
+VGETEXPPSZrbkz 2619
+VGETEXPPSZrk 2620
+VGETEXPPSZrkz 2621
+VGETEXPSDZm 2622
+VGETEXPSDZmk 2623
+VGETEXPSDZmkz 2624
+VGETEXPSDZr 2625
+VGETEXPSDZrb 2626
+VGETEXPSDZrbk 2627
+VGETEXPSDZrbkz 2628
+VGETEXPSDZrk 2629
+VGETEXPSDZrkz 2630
+VGETEXPSHZm 2631
+VGETEXPSHZmk 2632
+VGETEXPSHZmkz 2633
+VGETEXPSHZr 2634
+VGETEXPSHZrb 2635
+VGETEXPSHZrbk 2636
+VGETEXPSHZrbkz 2637
+VGETEXPSHZrk 2638
+VGETEXPSHZrkz 2639
+VGETEXPSSZm 2640
+VGETEXPSSZmk 2641
+VGETEXPSSZmkz 2642
+VGETEXPSSZr 2643
+VGETEXPSSZrb 2644
+VGETEXPSSZrbk 2645
+VGETEXPSSZrbkz 2646
+VGETEXPSSZrk 2647
+VGETEXPSSZrkz 2648
+VGETMANTBF 2649
+VGETMANTPDZ 2650
+VGETMANTPDZrmbi 2651
+VGETMANTPDZrmbik 2652
+VGETMANTPDZrmbikz 2653
+VGETMANTPDZrmi 2654
+VGETMANTPDZrmik 2655
+VGETMANTPDZrmikz 2656
+VGETMANTPDZrri 2657
+VGETMANTPDZrrib 2658
+VGETMANTPDZrribk 2659
+VGETMANTPDZrribkz 2660
+VGETMANTPDZrrik 2661
+VGETMANTPDZrrikz 2662
+VGETMANTPHZ 2663
+VGETMANTPHZrmbi 2664
+VGETMANTPHZrmbik 2665
+VGETMANTPHZrmbikz 2666
+VGETMANTPHZrmi 2667
+VGETMANTPHZrmik 2668
+VGETMANTPHZrmikz 2669
+VGETMANTPHZrri 2670
+VGETMANTPHZrrib 2671
+VGETMANTPHZrribk 2672
+VGETMANTPHZrribkz 2673
+VGETMANTPHZrrik 2674
+VGETMANTPHZrrikz 2675
+VGETMANTPSZ 2676
+VGETMANTPSZrmbi 2677
+VGETMANTPSZrmbik 2678
+VGETMANTPSZrmbikz 2679
+VGETMANTPSZrmi 2680
+VGETMANTPSZrmik 2681
+VGETMANTPSZrmikz 2682
+VGETMANTPSZrri 2683
+VGETMANTPSZrrib 2684
+VGETMANTPSZrribk 2685
+VGETMANTPSZrribkz 2686
+VGETMANTPSZrrik 2687
+VGETMANTPSZrrikz 2688
+VGETMANTSDZrmi 2689
+VGETMANTSDZrmik 2690
+VGETMANTSDZrmikz 2691
+VGETMANTSDZrri 2692
+VGETMANTSDZrrib 2693
+VGETMANTSDZrribk 2694
+VGETMANTSDZrribkz 2695
+VGETMANTSDZrrik 2696
+VGETMANTSDZrrikz 2697
+VGETMANTSHZrmi 2698
+VGETMANTSHZrmik 2699
+VGETMANTSHZrmikz 2700
+VGETMANTSHZrri 2701
+VGETMANTSHZrrib 2702
+VGETMANTSHZrribk 2703
+VGETMANTSHZrribkz 2704
+VGETMANTSHZrrik 2705
+VGETMANTSHZrrikz 2706
+VGETMANTSSZrmi 2707
+VGETMANTSSZrmik 2708
+VGETMANTSSZrmikz 2709
+VGETMANTSSZrri 2710
+VGETMANTSSZrrib 2711
+VGETMANTSSZrribk 2712
+VGETMANTSSZrribkz 2713
+VGETMANTSSZrrik 2714
+VGETMANTSSZrrikz 2715
+VGF 2716
+VHADDPDYrm 2717
+VHADDPDYrr 2718
+VHADDPDrm 2719
+VHADDPDrr 2720
+VHADDPSYrm 2721
+VHADDPSYrr 2722
+VHADDPSrm 2723
+VHADDPSrr 2724
+VHSUBPDYrm 2725
+VHSUBPDYrr 2726
+VHSUBPDrm 2727
+VHSUBPDrr 2728
+VHSUBPSYrm 2729
+VHSUBPSYrr 2730
+VHSUBPSrm 2731
+VHSUBPSrr 2732
+VINSERTF 2733
+VINSERTI 2734
+VINSERTPSZrmi 2735
+VINSERTPSZrri 2736
+VINSERTPSrmi 2737
+VINSERTPSrri 2738
+VLDDQUYrm 2739
+VLDDQUrm 2740
+VLDMXCSR 2741
+VMASKMOVDQU 2742
+VMASKMOVPDYmr 2743
+VMASKMOVPDYrm 2744
+VMASKMOVPDmr 2745
+VMASKMOVPDrm 2746
+VMASKMOVPSYmr 2747
+VMASKMOVPSYrm 2748
+VMASKMOVPSmr 2749
+VMASKMOVPSrm 2750
+VMAXBF 2751
+VMAXCPDYrm 2752
+VMAXCPDYrr 2753
+VMAXCPDZ 2754
+VMAXCPDZrm 2755
+VMAXCPDZrmb 2756
+VMAXCPDZrmbk 2757
+VMAXCPDZrmbkz 2758
+VMAXCPDZrmk 2759
+VMAXCPDZrmkz 2760
+VMAXCPDZrr 2761
+VMAXCPDZrrk 2762
+VMAXCPDZrrkz 2763
+VMAXCPDrm 2764
+VMAXCPDrr 2765
+VMAXCPHZ 2766
+VMAXCPHZrm 2767
+VMAXCPHZrmb 2768
+VMAXCPHZrmbk 2769
+VMAXCPHZrmbkz 2770
+VMAXCPHZrmk 2771
+VMAXCPHZrmkz 2772
+VMAXCPHZrr 2773
+VMAXCPHZrrk 2774
+VMAXCPHZrrkz 2775
+VMAXCPSYrm 2776
+VMAXCPSYrr 2777
+VMAXCPSZ 2778
+VMAXCPSZrm 2779
+VMAXCPSZrmb 2780
+VMAXCPSZrmbk 2781
+VMAXCPSZrmbkz 2782
+VMAXCPSZrmk 2783
+VMAXCPSZrmkz 2784
+VMAXCPSZrr 2785
+VMAXCPSZrrk 2786
+VMAXCPSZrrkz 2787
+VMAXCPSrm 2788
+VMAXCPSrr 2789
+VMAXCSDZrm 2790
+VMAXCSDZrr 2791
+VMAXCSDrm 2792
+VMAXCSDrr 2793
+VMAXCSHZrm 2794
+VMAXCSHZrr 2795
+VMAXCSSZrm 2796
+VMAXCSSZrr 2797
+VMAXCSSrm 2798
+VMAXCSSrr 2799
+VMAXPDYrm 2800
+VMAXPDYrr 2801
+VMAXPDZ 2802
+VMAXPDZrm 2803
+VMAXPDZrmb 2804
+VMAXPDZrmbk 2805
+VMAXPDZrmbkz 2806
+VMAXPDZrmk 2807
+VMAXPDZrmkz 2808
+VMAXPDZrr 2809
+VMAXPDZrrb 2810
+VMAXPDZrrbk 2811
+VMAXPDZrrbkz 2812
+VMAXPDZrrk 2813
+VMAXPDZrrkz 2814
+VMAXPDrm 2815
+VMAXPDrr 2816
+VMAXPHZ 2817
+VMAXPHZrm 2818
+VMAXPHZrmb 2819
+VMAXPHZrmbk 2820
+VMAXPHZrmbkz 2821
+VMAXPHZrmk 2822
+VMAXPHZrmkz 2823
+VMAXPHZrr 2824
+VMAXPHZrrb 2825
+VMAXPHZrrbk 2826
+VMAXPHZrrbkz 2827
+VMAXPHZrrk 2828
+VMAXPHZrrkz 2829
+VMAXPSYrm 2830
+VMAXPSYrr 2831
+VMAXPSZ 2832
+VMAXPSZrm 2833
+VMAXPSZrmb 2834
+VMAXPSZrmbk 2835
+VMAXPSZrmbkz 2836
+VMAXPSZrmk 2837
+VMAXPSZrmkz 2838
+VMAXPSZrr 2839
+VMAXPSZrrb 2840
+VMAXPSZrrbk 2841
+VMAXPSZrrbkz 2842
+VMAXPSZrrk 2843
+VMAXPSZrrkz 2844
+VMAXPSrm 2845
+VMAXPSrr 2846
+VMAXSDZrm 2847
+VMAXSDZrm_Int 2848
+VMAXSDZrmk_Int 2849
+VMAXSDZrmkz_Int 2850
+VMAXSDZrr 2851
+VMAXSDZrr_Int 2852
+VMAXSDZrrb_Int 2853
+VMAXSDZrrbk_Int 2854
+VMAXSDZrrbkz_Int 2855
+VMAXSDZrrk_Int 2856
+VMAXSDZrrkz_Int 2857
+VMAXSDrm 2858
+VMAXSDrm_Int 2859
+VMAXSDrr 2860
+VMAXSDrr_Int 2861
+VMAXSHZrm 2862
+VMAXSHZrm_Int 2863
+VMAXSHZrmk_Int 2864
+VMAXSHZrmkz_Int 2865
+VMAXSHZrr 2866
+VMAXSHZrr_Int 2867
+VMAXSHZrrb_Int 2868
+VMAXSHZrrbk_Int 2869
+VMAXSHZrrbkz_Int 2870
+VMAXSHZrrk_Int 2871
+VMAXSHZrrkz_Int 2872
+VMAXSSZrm 2873
+VMAXSSZrm_Int 2874
+VMAXSSZrmk_Int 2875
+VMAXSSZrmkz_Int 2876
+VMAXSSZrr 2877
+VMAXSSZrr_Int 2878
+VMAXSSZrrb_Int 2879
+VMAXSSZrrbk_Int 2880
+VMAXSSZrrbkz_Int 2881
+VMAXSSZrrk_Int 2882
+VMAXSSZrrkz_Int 2883
+VMAXSSrm 2884
+VMAXSSrm_Int 2885
+VMAXSSrr 2886
+VMAXSSrr_Int 2887
+VMCALL 2888
+VMCLEARm 2889
+VMFUNC 2890
+VMINBF 2891
+VMINCPDYrm 2892
+VMINCPDYrr 2893
+VMINCPDZ 2894
+VMINCPDZrm 2895
+VMINCPDZrmb 2896
+VMINCPDZrmbk 2897
+VMINCPDZrmbkz 2898
+VMINCPDZrmk 2899
+VMINCPDZrmkz 2900
+VMINCPDZrr 2901
+VMINCPDZrrk 2902
+VMINCPDZrrkz 2903
+VMINCPDrm 2904
+VMINCPDrr 2905
+VMINCPHZ 2906
+VMINCPHZrm 2907
+VMINCPHZrmb 2908
+VMINCPHZrmbk 2909
+VMINCPHZrmbkz 2910
+VMINCPHZrmk 2911
+VMINCPHZrmkz 2912
+VMINCPHZrr 2913
+VMINCPHZrrk 2914
+VMINCPHZrrkz 2915
+VMINCPSYrm 2916
+VMINCPSYrr 2917
+VMINCPSZ 2918
+VMINCPSZrm 2919
+VMINCPSZrmb 2920
+VMINCPSZrmbk 2921
+VMINCPSZrmbkz 2922
+VMINCPSZrmk 2923
+VMINCPSZrmkz 2924
+VMINCPSZrr 2925
+VMINCPSZrrk 2926
+VMINCPSZrrkz 2927
+VMINCPSrm 2928
+VMINCPSrr 2929
+VMINCSDZrm 2930
+VMINCSDZrr 2931
+VMINCSDrm 2932
+VMINCSDrr 2933
+VMINCSHZrm 2934
+VMINCSHZrr 2935
+VMINCSSZrm 2936
+VMINCSSZrr 2937
+VMINCSSrm 2938
+VMINCSSrr 2939
+VMINMAXBF 2940
+VMINMAXPDZ 2941
+VMINMAXPDZrmbi 2942
+VMINMAXPDZrmbik 2943
+VMINMAXPDZrmbikz 2944
+VMINMAXPDZrmi 2945
+VMINMAXPDZrmik 2946
+VMINMAXPDZrmikz 2947
+VMINMAXPDZrri 2948
+VMINMAXPDZrrib 2949
+VMINMAXPDZrribk 2950
+VMINMAXPDZrribkz 2951
+VMINMAXPDZrrik 2952
+VMINMAXPDZrrikz 2953
+VMINMAXPHZ 2954
+VMINMAXPHZrmbi 2955
+VMINMAXPHZrmbik 2956
+VMINMAXPHZrmbikz 2957
+VMINMAXPHZrmi 2958
+VMINMAXPHZrmik 2959
+VMINMAXPHZrmikz 2960
+VMINMAXPHZrri 2961
+VMINMAXPHZrrib 2962
+VMINMAXPHZrribk 2963
+VMINMAXPHZrribkz 2964
+VMINMAXPHZrrik 2965
+VMINMAXPHZrrikz 2966
+VMINMAXPSZ 2967
+VMINMAXPSZrmbi 2968
+VMINMAXPSZrmbik 2969
+VMINMAXPSZrmbikz 2970
+VMINMAXPSZrmi 2971
+VMINMAXPSZrmik 2972
+VMINMAXPSZrmikz 2973
+VMINMAXPSZrri 2974
+VMINMAXPSZrrib 2975
+VMINMAXPSZrribk 2976
+VMINMAXPSZrribkz 2977
+VMINMAXPSZrrik 2978
+VMINMAXPSZrrikz 2979
+VMINMAXSDrmi 2980
+VMINMAXSDrmi_Int 2981
+VMINMAXSDrmik_Int 2982
+VMINMAXSDrmikz_Int 2983
+VMINMAXSDrri 2984
+VMINMAXSDrri_Int 2985
+VMINMAXSDrrib_Int 2986
+VMINMAXSDrribk_Int 2987
+VMINMAXSDrribkz_Int 2988
+VMINMAXSDrrik_Int 2989
+VMINMAXSDrrikz_Int 2990
+VMINMAXSHrmi 2991
+VMINMAXSHrmi_Int 2992
+VMINMAXSHrmik_Int 2993
+VMINMAXSHrmikz_Int 2994
+VMINMAXSHrri 2995
+VMINMAXSHrri_Int 2996
+VMINMAXSHrrib_Int 2997
+VMINMAXSHrribk_Int 2998
+VMINMAXSHrribkz_Int 2999
+VMINMAXSHrrik_Int 3000
+VMINMAXSHrrikz_Int 3001
+VMINMAXSSrmi 3002
+VMINMAXSSrmi_Int 3003
+VMINMAXSSrmik_Int 3004
+VMINMAXSSrmikz_Int 3005
+VMINMAXSSrri 3006
+VMINMAXSSrri_Int 3007
+VMINMAXSSrrib_Int 3008
+VMINMAXSSrribk_Int 3009
+VMINMAXSSrribkz_Int 3010
+VMINMAXSSrrik_Int 3011
+VMINMAXSSrrikz_Int 3012
+VMINPDYrm 3013
+VMINPDYrr 3014
+VMINPDZ 3015
+VMINPDZrm 3016
+VMINPDZrmb 3017
+VMINPDZrmbk 3018
+VMINPDZrmbkz 3019
+VMINPDZrmk 3020
+VMINPDZrmkz 3021
+VMINPDZrr 3022
+VMINPDZrrb 3023
+VMINPDZrrbk 3024
+VMINPDZrrbkz 3025
+VMINPDZrrk 3026
+VMINPDZrrkz 3027
+VMINPDrm 3028
+VMINPDrr 3029
+VMINPHZ 3030
+VMINPHZrm 3031
+VMINPHZrmb 3032
+VMINPHZrmbk 3033
+VMINPHZrmbkz 3034
+VMINPHZrmk 3035
+VMINPHZrmkz 3036
+VMINPHZrr 3037
+VMINPHZrrb 3038
+VMINPHZrrbk 3039
+VMINPHZrrbkz 3040
+VMINPHZrrk 3041
+VMINPHZrrkz 3042
+VMINPSYrm 3043
+VMINPSYrr 3044
+VMINPSZ 3045
+VMINPSZrm 3046
+VMINPSZrmb 3047
+VMINPSZrmbk 3048
+VMINPSZrmbkz 3049
+VMINPSZrmk 3050
+VMINPSZrmkz 3051
+VMINPSZrr 3052
+VMINPSZrrb 3053
+VMINPSZrrbk 3054
+VMINPSZrrbkz 3055
+VMINPSZrrk 3056
+VMINPSZrrkz 3057
+VMINPSrm 3058
+VMINPSrr 3059
+VMINSDZrm 3060
+VMINSDZrm_Int 3061
+VMINSDZrmk_Int 3062
+VMINSDZrmkz_Int 3063
+VMINSDZrr 3064
+VMINSDZrr_Int 3065
+VMINSDZrrb_Int 3066
+VMINSDZrrbk_Int 3067
+VMINSDZrrbkz_Int 3068
+VMINSDZrrk_Int 3069
+VMINSDZrrkz_Int 3070
+VMINSDrm 3071
+VMINSDrm_Int 3072
+VMINSDrr 3073
+VMINSDrr_Int 3074
+VMINSHZrm 3075
+VMINSHZrm_Int 3076
+VMINSHZrmk_Int 3077
+VMINSHZrmkz_Int 3078
+VMINSHZrr 3079
+VMINSHZrr_Int 3080
+VMINSHZrrb_Int 3081
+VMINSHZrrbk_Int 3082
+VMINSHZrrbkz_Int 3083
+VMINSHZrrk_Int 3084
+VMINSHZrrkz_Int 3085
+VMINSSZrm 3086
+VMINSSZrm_Int 3087
+VMINSSZrmk_Int 3088
+VMINSSZrmkz_Int 3089
+VMINSSZrr 3090
+VMINSSZrr_Int 3091
+VMINSSZrrb_Int 3092
+VMINSSZrrbk_Int 3093
+VMINSSZrrbkz_Int 3094
+VMINSSZrrk_Int 3095
+VMINSSZrrkz_Int 3096
+VMINSSrm 3097
+VMINSSrm_Int 3098
+VMINSSrr 3099
+VMINSSrr_Int 3100
+VMLAUNCH 3101
+VMLOAD 3102
+VMMCALL 3103
+VMOV 3104
+VMOVAPDYmr 3105
+VMOVAPDYrm 3106
+VMOVAPDYrr 3107
+VMOVAPDYrr_REV 3108
+VMOVAPDZ 3109
+VMOVAPDZmr 3110
+VMOVAPDZmrk 3111
+VMOVAPDZrm 3112
+VMOVAPDZrmk 3113
+VMOVAPDZrmkz 3114
+VMOVAPDZrr 3115
+VMOVAPDZrr_REV 3116
+VMOVAPDZrrk 3117
+VMOVAPDZrrk_REV 3118
+VMOVAPDZrrkz 3119
+VMOVAPDZrrkz_REV 3120
+VMOVAPDmr 3121
+VMOVAPDrm 3122
+VMOVAPDrr 3123
+VMOVAPDrr_REV 3124
+VMOVAPSYmr 3125
+VMOVAPSYrm 3126
+VMOVAPSYrr 3127
+VMOVAPSYrr_REV 3128
+VMOVAPSZ 3129
+VMOVAPSZmr 3130
+VMOVAPSZmrk 3131
+VMOVAPSZrm 3132
+VMOVAPSZrmk 3133
+VMOVAPSZrmkz 3134
+VMOVAPSZrr 3135
+VMOVAPSZrr_REV 3136
+VMOVAPSZrrk 3137
+VMOVAPSZrrk_REV 3138
+VMOVAPSZrrkz 3139
+VMOVAPSZrrkz_REV 3140
+VMOVAPSmr 3141
+VMOVAPSrm 3142
+VMOVAPSrr 3143
+VMOVAPSrr_REV 3144
+VMOVDDUPYrm 3145
+VMOVDDUPYrr 3146
+VMOVDDUPZ 3147
+VMOVDDUPZrm 3148
+VMOVDDUPZrmk 3149
+VMOVDDUPZrmkz 3150
+VMOVDDUPZrr 3151
+VMOVDDUPZrrk 3152
+VMOVDDUPZrrkz 3153
+VMOVDDUPrm 3154
+VMOVDDUPrr 3155
+VMOVDI 3156
+VMOVDQA 3157
+VMOVDQAYmr 3158
+VMOVDQAYrm 3159
+VMOVDQAYrr 3160
+VMOVDQAYrr_REV 3161
+VMOVDQAmr 3162
+VMOVDQArm 3163
+VMOVDQArr 3164
+VMOVDQArr_REV 3165
+VMOVDQU 3166
+VMOVDQUYmr 3167
+VMOVDQUYrm 3168
+VMOVDQUYrr 3169
+VMOVDQUYrr_REV 3170
+VMOVDQUmr 3171
+VMOVDQUrm 3172
+VMOVDQUrr 3173
+VMOVDQUrr_REV 3174
+VMOVHLPSZrr 3175
+VMOVHLPSrr 3176
+VMOVHPDZ 3177
+VMOVHPDmr 3178
+VMOVHPDrm 3179
+VMOVHPSZ 3180
+VMOVHPSmr 3181
+VMOVHPSrm 3182
+VMOVLHPSZrr 3183
+VMOVLHPSrr 3184
+VMOVLPDZ 3185
+VMOVLPDmr 3186
+VMOVLPDrm 3187
+VMOVLPSZ 3188
+VMOVLPSmr 3189
+VMOVLPSrm 3190
+VMOVMSKPDYrr 3191
+VMOVMSKPDrr 3192
+VMOVMSKPSYrr 3193
+VMOVMSKPSrr 3194
+VMOVNTDQAYrm 3195
+VMOVNTDQAZ 3196
+VMOVNTDQAZrm 3197
+VMOVNTDQArm 3198
+VMOVNTDQYmr 3199
+VMOVNTDQZ 3200
+VMOVNTDQZmr 3201
+VMOVNTDQmr 3202
+VMOVNTPDYmr 3203
+VMOVNTPDZ 3204
+VMOVNTPDZmr 3205
+VMOVNTPDmr 3206
+VMOVNTPSYmr 3207
+VMOVNTPSZ 3208
+VMOVNTPSZmr 3209
+VMOVNTPSmr 3210
+VMOVPDI 3211
+VMOVPQI 3212
+VMOVPQIto 3213
+VMOVQI 3214
+VMOVRSBZ 3215
+VMOVRSBZm 3216
+VMOVRSBZmk 3217
+VMOVRSBZmkz 3218
+VMOVRSDZ 3219
+VMOVRSDZm 3220
+VMOVRSDZmk 3221
+VMOVRSDZmkz 3222
+VMOVRSQZ 3223
+VMOVRSQZm 3224
+VMOVRSQZmk 3225
+VMOVRSQZmkz 3226
+VMOVRSWZ 3227
+VMOVRSWZm 3228
+VMOVRSWZmk 3229
+VMOVRSWZmkz 3230
+VMOVSDZmr 3231
+VMOVSDZmrk 3232
+VMOVSDZrm 3233
+VMOVSDZrm_alt 3234
+VMOVSDZrmk 3235
+VMOVSDZrmkz 3236
+VMOVSDZrr 3237
+VMOVSDZrr_REV 3238
+VMOVSDZrrk 3239
+VMOVSDZrrk_REV 3240
+VMOVSDZrrkz 3241
+VMOVSDZrrkz_REV 3242
+VMOVSDmr 3243
+VMOVSDrm 3244
+VMOVSDrm_alt 3245
+VMOVSDrr 3246
+VMOVSDrr_REV 3247
+VMOVSDto 3248
+VMOVSH 3249
+VMOVSHDUPYrm 3250
+VMOVSHDUPYrr 3251
+VMOVSHDUPZ 3252
+VMOVSHDUPZrm 3253
+VMOVSHDUPZrmk 3254
+VMOVSHDUPZrmkz 3255
+VMOVSHDUPZrr 3256
+VMOVSHDUPZrrk 3257
+VMOVSHDUPZrrkz 3258
+VMOVSHDUPrm 3259
+VMOVSHDUPrr 3260
+VMOVSHZmr 3261
+VMOVSHZmrk 3262
+VMOVSHZrm 3263
+VMOVSHZrm_alt 3264
+VMOVSHZrmk 3265
+VMOVSHZrmkz 3266
+VMOVSHZrr 3267
+VMOVSHZrr_REV 3268
+VMOVSHZrrk 3269
+VMOVSHZrrk_REV 3270
+VMOVSHZrrkz 3271
+VMOVSHZrrkz_REV 3272
+VMOVSHtoW 3273
+VMOVSLDUPYrm 3274
+VMOVSLDUPYrr 3275
+VMOVSLDUPZ 3276
+VMOVSLDUPZrm 3277
+VMOVSLDUPZrmk 3278
+VMOVSLDUPZrmkz 3279
+VMOVSLDUPZrr 3280
+VMOVSLDUPZrrk 3281
+VMOVSLDUPZrrkz 3282
+VMOVSLDUPrm 3283
+VMOVSLDUPrr 3284
+VMOVSS 3285
+VMOVSSZmr 3286
+VMOVSSZmrk 3287
+VMOVSSZrm 3288
+VMOVSSZrm_alt 3289
+VMOVSSZrmk 3290
+VMOVSSZrmkz 3291
+VMOVSSZrr 3292
+VMOVSSZrr_REV 3293
+VMOVSSZrrk 3294
+VMOVSSZrrk_REV 3295
+VMOVSSZrrkz 3296
+VMOVSSZrrkz_REV 3297
+VMOVSSmr 3298
+VMOVSSrm 3299
+VMOVSSrm_alt 3300
+VMOVSSrr 3301
+VMOVSSrr_REV 3302
+VMOVUPDYmr 3303
+VMOVUPDYrm 3304
+VMOVUPDYrr 3305
+VMOVUPDYrr_REV 3306
+VMOVUPDZ 3307
+VMOVUPDZmr 3308
+VMOVUPDZmrk 3309
+VMOVUPDZrm 3310
+VMOVUPDZrmk 3311
+VMOVUPDZrmkz 3312
+VMOVUPDZrr 3313
+VMOVUPDZrr_REV 3314
+VMOVUPDZrrk 3315
+VMOVUPDZrrk_REV 3316
+VMOVUPDZrrkz 3317
+VMOVUPDZrrkz_REV 3318
+VMOVUPDmr 3319
+VMOVUPDrm 3320
+VMOVUPDrr 3321
+VMOVUPDrr_REV 3322
+VMOVUPSYmr 3323
+VMOVUPSYrm 3324
+VMOVUPSYrr 3325
+VMOVUPSYrr_REV 3326
+VMOVUPSZ 3327
+VMOVUPSZmr 3328
+VMOVUPSZmrk 3329
+VMOVUPSZrm 3330
+VMOVUPSZrmk 3331
+VMOVUPSZrmkz 3332
+VMOVUPSZrr 3333
+VMOVUPSZrr_REV 3334
+VMOVUPSZrrk 3335
+VMOVUPSZrrk_REV 3336
+VMOVUPSZrrkz 3337
+VMOVUPSZrrkz_REV 3338
+VMOVUPSmr 3339
+VMOVUPSrm 3340
+VMOVUPSrr 3341
+VMOVUPSrr_REV 3342
+VMOVW 3343
+VMOVWmr 3344
+VMOVWrm 3345
+VMOVZPDILo 3346
+VMOVZPQILo 3347
+VMOVZPWILo 3348
+VMPSADBWYrmi 3349
+VMPSADBWYrri 3350
+VMPSADBWZ 3351
+VMPSADBWZrmi 3352
+VMPSADBWZrmik 3353
+VMPSADBWZrmikz 3354
+VMPSADBWZrri 3355
+VMPSADBWZrrik 3356
+VMPSADBWZrrikz 3357
+VMPSADBWrmi 3358
+VMPSADBWrri 3359
+VMPTRLDm 3360
+VMPTRSTm 3361
+VMREAD 3362
+VMRESUME 3363
+VMRUN 3364
+VMSAVE 3365
+VMULBF 3366
+VMULPDYrm 3367
+VMULPDYrr 3368
+VMULPDZ 3369
+VMULPDZrm 3370
+VMULPDZrmb 3371
+VMULPDZrmbk 3372
+VMULPDZrmbkz 3373
+VMULPDZrmk 3374
+VMULPDZrmkz 3375
+VMULPDZrr 3376
+VMULPDZrrb 3377
+VMULPDZrrbk 3378
+VMULPDZrrbkz 3379
+VMULPDZrrk 3380
+VMULPDZrrkz 3381
+VMULPDrm 3382
+VMULPDrr 3383
+VMULPHZ 3384
+VMULPHZrm 3385
+VMULPHZrmb 3386
+VMULPHZrmbk 3387
+VMULPHZrmbkz 3388
+VMULPHZrmk 3389
+VMULPHZrmkz 3390
+VMULPHZrr 3391
+VMULPHZrrb 3392
+VMULPHZrrbk 3393
+VMULPHZrrbkz 3394
+VMULPHZrrk 3395
+VMULPHZrrkz 3396
+VMULPSYrm 3397
+VMULPSYrr 3398
+VMULPSZ 3399
+VMULPSZrm 3400
+VMULPSZrmb 3401
+VMULPSZrmbk 3402
+VMULPSZrmbkz 3403
+VMULPSZrmk 3404
+VMULPSZrmkz 3405
+VMULPSZrr 3406
+VMULPSZrrb 3407
+VMULPSZrrbk 3408
+VMULPSZrrbkz 3409
+VMULPSZrrk 3410
+VMULPSZrrkz 3411
+VMULPSrm 3412
+VMULPSrr 3413
+VMULSDZrm 3414
+VMULSDZrm_Int 3415
+VMULSDZrmk_Int 3416
+VMULSDZrmkz_Int 3417
+VMULSDZrr 3418
+VMULSDZrr_Int 3419
+VMULSDZrrb_Int 3420
+VMULSDZrrbk_Int 3421
+VMULSDZrrbkz_Int 3422
+VMULSDZrrk_Int 3423
+VMULSDZrrkz_Int 3424
+VMULSDrm 3425
+VMULSDrm_Int 3426
+VMULSDrr 3427
+VMULSDrr_Int 3428
+VMULSHZrm 3429
+VMULSHZrm_Int 3430
+VMULSHZrmk_Int 3431
+VMULSHZrmkz_Int 3432
+VMULSHZrr 3433
+VMULSHZrr_Int 3434
+VMULSHZrrb_Int 3435
+VMULSHZrrbk_Int 3436
+VMULSHZrrbkz_Int 3437
+VMULSHZrrk_Int 3438
+VMULSHZrrkz_Int 3439
+VMULSSZrm 3440
+VMULSSZrm_Int 3441
+VMULSSZrmk_Int 3442
+VMULSSZrmkz_Int 3443
+VMULSSZrr 3444
+VMULSSZrr_Int 3445
+VMULSSZrrb_Int 3446
+VMULSSZrrbk_Int 3447
+VMULSSZrrbkz_Int 3448
+VMULSSZrrk_Int 3449
+VMULSSZrrkz_Int 3450
+VMULSSrm 3451
+VMULSSrm_Int 3452
+VMULSSrr 3453
+VMULSSrr_Int 3454
+VMWRITE 3455
+VMXOFF 3456
+VMXON 3457
+VORPDYrm 3458
+VORPDYrr 3459
+VORPDZ 3460
+VORPDZrm 3461
+VORPDZrmb 3462
+VORPDZrmbk 3463
+VORPDZrmbkz 3464
+VORPDZrmk 3465
+VORPDZrmkz 3466
+VORPDZrr 3467
+VORPDZrrk 3468
+VORPDZrrkz 3469
+VORPDrm 3470
+VORPDrr 3471
+VORPSYrm 3472
+VORPSYrr 3473
+VORPSZ 3474
+VORPSZrm 3475
+VORPSZrmb 3476
+VORPSZrmbk 3477
+VORPSZrmbkz 3478
+VORPSZrmk 3479
+VORPSZrmkz 3480
+VORPSZrr 3481
+VORPSZrrk 3482
+VORPSZrrkz 3483
+VORPSrm 3484
+VORPSrr 3485
+VP 3486
+VPABSBYrm 3487
+VPABSBYrr 3488
+VPABSBZ 3489
+VPABSBZrm 3490
+VPABSBZrmk 3491
+VPABSBZrmkz 3492
+VPABSBZrr 3493
+VPABSBZrrk 3494
+VPABSBZrrkz 3495
+VPABSBrm 3496
+VPABSBrr 3497
+VPABSDYrm 3498
+VPABSDYrr 3499
+VPABSDZ 3500
+VPABSDZrm 3501
+VPABSDZrmb 3502
+VPABSDZrmbk 3503
+VPABSDZrmbkz 3504
+VPABSDZrmk 3505
+VPABSDZrmkz 3506
+VPABSDZrr 3507
+VPABSDZrrk 3508
+VPABSDZrrkz 3509
+VPABSDrm 3510
+VPABSDrr 3511
+VPABSQZ 3512
+VPABSQZrm 3513
+VPABSQZrmb 3514
+VPABSQZrmbk 3515
+VPABSQZrmbkz 3516
+VPABSQZrmk 3517
+VPABSQZrmkz 3518
+VPABSQZrr 3519
+VPABSQZrrk 3520
+VPABSQZrrkz 3521
+VPABSWYrm 3522
+VPABSWYrr 3523
+VPABSWZ 3524
+VPABSWZrm 3525
+VPABSWZrmk 3526
+VPABSWZrmkz 3527
+VPABSWZrr 3528
+VPABSWZrrk 3529
+VPABSWZrrkz 3530
+VPABSWrm 3531
+VPABSWrr 3532
+VPACKSSDWYrm 3533
+VPACKSSDWYrr 3534
+VPACKSSDWZ 3535
+VPACKSSDWZrm 3536
+VPACKSSDWZrmb 3537
+VPACKSSDWZrmbk 3538
+VPACKSSDWZrmbkz 3539
+VPACKSSDWZrmk 3540
+VPACKSSDWZrmkz 3541
+VPACKSSDWZrr 3542
+VPACKSSDWZrrk 3543
+VPACKSSDWZrrkz 3544
+VPACKSSDWrm 3545
+VPACKSSDWrr 3546
+VPACKSSWBYrm 3547
+VPACKSSWBYrr 3548
+VPACKSSWBZ 3549
+VPACKSSWBZrm 3550
+VPACKSSWBZrmk 3551
+VPACKSSWBZrmkz 3552
+VPACKSSWBZrr 3553
+VPACKSSWBZrrk 3554
+VPACKSSWBZrrkz 3555
+VPACKSSWBrm 3556
+VPACKSSWBrr 3557
+VPACKUSDWYrm 3558
+VPACKUSDWYrr 3559
+VPACKUSDWZ 3560
+VPACKUSDWZrm 3561
+VPACKUSDWZrmb 3562
+VPACKUSDWZrmbk 3563
+VPACKUSDWZrmbkz 3564
+VPACKUSDWZrmk 3565
+VPACKUSDWZrmkz 3566
+VPACKUSDWZrr 3567
+VPACKUSDWZrrk 3568
+VPACKUSDWZrrkz 3569
+VPACKUSDWrm 3570
+VPACKUSDWrr 3571
+VPACKUSWBYrm 3572
+VPACKUSWBYrr 3573
+VPACKUSWBZ 3574
+VPACKUSWBZrm 3575
+VPACKUSWBZrmk 3576
+VPACKUSWBZrmkz 3577
+VPACKUSWBZrr 3578
+VPACKUSWBZrrk 3579
+VPACKUSWBZrrkz 3580
+VPACKUSWBrm 3581
+VPACKUSWBrr 3582
+VPADDBYrm 3583
+VPADDBYrr 3584
+VPADDBZ 3585
+VPADDBZrm 3586
+VPADDBZrmk 3587
+VPADDBZrmkz 3588
+VPADDBZrr 3589
+VPADDBZrrk 3590
+VPADDBZrrkz 3591
+VPADDBrm 3592
+VPADDBrr 3593
+VPADDDYrm 3594
+VPADDDYrr 3595
+VPADDDZ 3596
+VPADDDZrm 3597
+VPADDDZrmb 3598
+VPADDDZrmbk 3599
+VPADDDZrmbkz 3600
+VPADDDZrmk 3601
+VPADDDZrmkz 3602
+VPADDDZrr 3603
+VPADDDZrrk 3604
+VPADDDZrrkz 3605
+VPADDDrm 3606
+VPADDDrr 3607
+VPADDQYrm 3608
+VPADDQYrr 3609
+VPADDQZ 3610
+VPADDQZrm 3611
+VPADDQZrmb 3612
+VPADDQZrmbk 3613
+VPADDQZrmbkz 3614
+VPADDQZrmk 3615
+VPADDQZrmkz 3616
+VPADDQZrr 3617
+VPADDQZrrk 3618
+VPADDQZrrkz 3619
+VPADDQrm 3620
+VPADDQrr 3621
+VPADDSBYrm 3622
+VPADDSBYrr 3623
+VPADDSBZ 3624
+VPADDSBZrm 3625
+VPADDSBZrmk 3626
+VPADDSBZrmkz 3627
+VPADDSBZrr 3628
+VPADDSBZrrk 3629
+VPADDSBZrrkz 3630
+VPADDSBrm 3631
+VPADDSBrr 3632
+VPADDSWYrm 3633
+VPADDSWYrr 3634
+VPADDSWZ 3635
+VPADDSWZrm 3636
+VPADDSWZrmk 3637
+VPADDSWZrmkz 3638
+VPADDSWZrr 3639
+VPADDSWZrrk 3640
+VPADDSWZrrkz 3641
+VPADDSWrm 3642
+VPADDSWrr 3643
+VPADDUSBYrm 3644
+VPADDUSBYrr 3645
+VPADDUSBZ 3646
+VPADDUSBZrm 3647
+VPADDUSBZrmk 3648
+VPADDUSBZrmkz 3649
+VPADDUSBZrr 3650
+VPADDUSBZrrk 3651
+VPADDUSBZrrkz 3652
+VPADDUSBrm 3653
+VPADDUSBrr 3654
+VPADDUSWYrm 3655
+VPADDUSWYrr 3656
+VPADDUSWZ 3657
+VPADDUSWZrm 3658
+VPADDUSWZrmk 3659
+VPADDUSWZrmkz 3660
+VPADDUSWZrr 3661
+VPADDUSWZrrk 3662
+VPADDUSWZrrkz 3663
+VPADDUSWrm 3664
+VPADDUSWrr 3665
+VPADDWYrm 3666
+VPADDWYrr 3667
+VPADDWZ 3668
+VPADDWZrm 3669
+VPADDWZrmk 3670
+VPADDWZrmkz 3671
+VPADDWZrr 3672
+VPADDWZrrk 3673
+VPADDWZrrkz 3674
+VPADDWrm 3675
+VPADDWrr 3676
+VPALIGNRYrmi 3677
+VPALIGNRYrri 3678
+VPALIGNRZ 3679
+VPALIGNRZrmi 3680
+VPALIGNRZrmik 3681
+VPALIGNRZrmikz 3682
+VPALIGNRZrri 3683
+VPALIGNRZrrik 3684
+VPALIGNRZrrikz 3685
+VPALIGNRrmi 3686
+VPALIGNRrri 3687
+VPANDDZ 3688
+VPANDDZrm 3689
+VPANDDZrmb 3690
+VPANDDZrmbk 3691
+VPANDDZrmbkz 3692
+VPANDDZrmk 3693
+VPANDDZrmkz 3694
+VPANDDZrr 3695
+VPANDDZrrk 3696
+VPANDDZrrkz 3697
+VPANDNDZ 3698
+VPANDNDZrm 3699
+VPANDNDZrmb 3700
+VPANDNDZrmbk 3701
+VPANDNDZrmbkz 3702
+VPANDNDZrmk 3703
+VPANDNDZrmkz 3704
+VPANDNDZrr 3705
+VPANDNDZrrk 3706
+VPANDNDZrrkz 3707
+VPANDNQZ 3708
+VPANDNQZrm 3709
+VPANDNQZrmb 3710
+VPANDNQZrmbk 3711
+VPANDNQZrmbkz 3712
+VPANDNQZrmk 3713
+VPANDNQZrmkz 3714
+VPANDNQZrr 3715
+VPANDNQZrrk 3716
+VPANDNQZrrkz 3717
+VPANDNYrm 3718
+VPANDNYrr 3719
+VPANDNrm 3720
+VPANDNrr 3721
+VPANDQZ 3722
+VPANDQZrm 3723
+VPANDQZrmb 3724
+VPANDQZrmbk 3725
+VPANDQZrmbkz 3726
+VPANDQZrmk 3727
+VPANDQZrmkz 3728
+VPANDQZrr 3729
+VPANDQZrrk 3730
+VPANDQZrrkz 3731
+VPANDYrm 3732
+VPANDYrr 3733
+VPANDrm 3734
+VPANDrr 3735
+VPAVGBYrm 3736
+VPAVGBYrr 3737
+VPAVGBZ 3738
+VPAVGBZrm 3739
+VPAVGBZrmk 3740
+VPAVGBZrmkz 3741
+VPAVGBZrr 3742
+VPAVGBZrrk 3743
+VPAVGBZrrkz 3744
+VPAVGBrm 3745
+VPAVGBrr 3746
+VPAVGWYrm 3747
+VPAVGWYrr 3748
+VPAVGWZ 3749
+VPAVGWZrm 3750
+VPAVGWZrmk 3751
+VPAVGWZrmkz 3752
+VPAVGWZrr 3753
+VPAVGWZrrk 3754
+VPAVGWZrrkz 3755
+VPAVGWrm 3756
+VPAVGWrr 3757
+VPBLENDDYrmi 3758
+VPBLENDDYrri 3759
+VPBLENDDrmi 3760
+VPBLENDDrri 3761
+VPBLENDMBZ 3762
+VPBLENDMBZrm 3763
+VPBLENDMBZrmk 3764
+VPBLENDMBZrmkz 3765
+VPBLENDMBZrr 3766
+VPBLENDMBZrrk 3767
+VPBLENDMBZrrkz 3768
+VPBLENDMDZ 3769
+VPBLENDMDZrm 3770
+VPBLENDMDZrmb 3771
+VPBLENDMDZrmbk 3772
+VPBLENDMDZrmbkz 3773
+VPBLENDMDZrmk 3774
+VPBLENDMDZrmkz 3775
+VPBLENDMDZrr 3776
+VPBLENDMDZrrk 3777
+VPBLENDMDZrrkz 3778
+VPBLENDMQZ 3779
+VPBLENDMQZrm 3780
+VPBLENDMQZrmb 3781
+VPBLENDMQZrmbk 3782
+VPBLENDMQZrmbkz 3783
+VPBLENDMQZrmk 3784
+VPBLENDMQZrmkz 3785
+VPBLENDMQZrr 3786
+VPBLENDMQZrrk 3787
+VPBLENDMQZrrkz 3788
+VPBLENDMWZ 3789
+VPBLENDMWZrm 3790
+VPBLENDMWZrmk 3791
+VPBLENDMWZrmkz 3792
+VPBLENDMWZrr 3793
+VPBLENDMWZrrk 3794
+VPBLENDMWZrrkz 3795
+VPBLENDVBYrmr 3796
+VPBLENDVBYrrr 3797
+VPBLENDVBrmr 3798
+VPBLENDVBrrr 3799
+VPBLENDWYrmi 3800
+VPBLENDWYrri 3801
+VPBLENDWrmi 3802
+VPBLENDWrri 3803
+VPBROADCASTBYrm 3804
+VPBROADCASTBYrr 3805
+VPBROADCASTBZ 3806
+VPBROADCASTBZrm 3807
+VPBROADCASTBZrmk 3808
+VPBROADCASTBZrmkz 3809
+VPBROADCASTBZrr 3810
+VPBROADCASTBZrrk 3811
+VPBROADCASTBZrrkz 3812
+VPBROADCASTBrZ 3813
+VPBROADCASTBrZrr 3814
+VPBROADCASTBrZrrk 3815
+VPBROADCASTBrZrrkz 3816
+VPBROADCASTBrm 3817
+VPBROADCASTBrr 3818
+VPBROADCASTDYrm 3819
+VPBROADCASTDYrr 3820
+VPBROADCASTDZ 3821
+VPBROADCASTDZrm 3822
+VPBROADCASTDZrmk 3823
+VPBROADCASTDZrmkz 3824
+VPBROADCASTDZrr 3825
+VPBROADCASTDZrrk 3826
+VPBROADCASTDZrrkz 3827
+VPBROADCASTDrZ 3828
+VPBROADCASTDrZrr 3829
+VPBROADCASTDrZrrk 3830
+VPBROADCASTDrZrrkz 3831
+VPBROADCASTDrm 3832
+VPBROADCASTDrr 3833
+VPBROADCASTMB 3834
+VPBROADCASTMW 3835
+VPBROADCASTQYrm 3836
+VPBROADCASTQYrr 3837
+VPBROADCASTQZ 3838
+VPBROADCASTQZrm 3839
+VPBROADCASTQZrmk 3840
+VPBROADCASTQZrmkz 3841
+VPBROADCASTQZrr 3842
+VPBROADCASTQZrrk 3843
+VPBROADCASTQZrrkz 3844
+VPBROADCASTQrZ 3845
+VPBROADCASTQrZrr 3846
+VPBROADCASTQrZrrk 3847
+VPBROADCASTQrZrrkz 3848
+VPBROADCASTQrm 3849
+VPBROADCASTQrr 3850
+VPBROADCASTWYrm 3851
+VPBROADCASTWYrr 3852
+VPBROADCASTWZ 3853
+VPBROADCASTWZrm 3854
+VPBROADCASTWZrmk 3855
+VPBROADCASTWZrmkz 3856
+VPBROADCASTWZrr 3857
+VPBROADCASTWZrrk 3858
+VPBROADCASTWZrrkz 3859
+VPBROADCASTWrZ 3860
+VPBROADCASTWrZrr 3861
+VPBROADCASTWrZrrk 3862
+VPBROADCASTWrZrrkz 3863
+VPBROADCASTWrm 3864
+VPBROADCASTWrr 3865
+VPCLMULQDQYrmi 3866
+VPCLMULQDQYrri 3867
+VPCLMULQDQZ 3868
+VPCLMULQDQZrmi 3869
+VPCLMULQDQZrri 3870
+VPCLMULQDQrmi 3871
+VPCLMULQDQrri 3872
+VPCMOVYrmr 3873
+VPCMOVYrrm 3874
+VPCMOVYrrr 3875
+VPCMOVYrrr_REV 3876
+VPCMOVrmr 3877
+VPCMOVrrm 3878
+VPCMOVrrr 3879
+VPCMOVrrr_REV 3880
+VPCMPBZ 3881
+VPCMPBZrmi 3882
+VPCMPBZrmik 3883
+VPCMPBZrri 3884
+VPCMPBZrrik 3885
+VPCMPDZ 3886
+VPCMPDZrmbi 3887
+VPCMPDZrmbik 3888
+VPCMPDZrmi 3889
+VPCMPDZrmik 3890
+VPCMPDZrri 3891
+VPCMPDZrrik 3892
+VPCMPEQBYrm 3893
+VPCMPEQBYrr 3894
+VPCMPEQBZ 3895
+VPCMPEQBZrm 3896
+VPCMPEQBZrmk 3897
+VPCMPEQBZrr 3898
+VPCMPEQBZrrk 3899
+VPCMPEQBrm 3900
+VPCMPEQBrr 3901
+VPCMPEQDYrm 3902
+VPCMPEQDYrr 3903
+VPCMPEQDZ 3904
+VPCMPEQDZrm 3905
+VPCMPEQDZrmb 3906
+VPCMPEQDZrmbk 3907
+VPCMPEQDZrmk 3908
+VPCMPEQDZrr 3909
+VPCMPEQDZrrk 3910
+VPCMPEQDrm 3911
+VPCMPEQDrr 3912
+VPCMPEQQYrm 3913
+VPCMPEQQYrr 3914
+VPCMPEQQZ 3915
+VPCMPEQQZrm 3916
+VPCMPEQQZrmb 3917
+VPCMPEQQZrmbk 3918
+VPCMPEQQZrmk 3919
+VPCMPEQQZrr 3920
+VPCMPEQQZrrk 3921
+VPCMPEQQrm 3922
+VPCMPEQQrr 3923
+VPCMPEQWYrm 3924
+VPCMPEQWYrr 3925
+VPCMPEQWZ 3926
+VPCMPEQWZrm 3927
+VPCMPEQWZrmk 3928
+VPCMPEQWZrr 3929
+VPCMPEQWZrrk 3930
+VPCMPEQWrm 3931
+VPCMPEQWrr 3932
+VPCMPESTRIrmi 3933
+VPCMPESTRIrri 3934
+VPCMPESTRMrmi 3935
+VPCMPESTRMrri 3936
+VPCMPGTBYrm 3937
+VPCMPGTBYrr 3938
+VPCMPGTBZ 3939
+VPCMPGTBZrm 3940
+VPCMPGTBZrmk 3941
+VPCMPGTBZrr 3942
+VPCMPGTBZrrk 3943
+VPCMPGTBrm 3944
+VPCMPGTBrr 3945
+VPCMPGTDYrm 3946
+VPCMPGTDYrr 3947
+VPCMPGTDZ 3948
+VPCMPGTDZrm 3949
+VPCMPGTDZrmb 3950
+VPCMPGTDZrmbk 3951
+VPCMPGTDZrmk 3952
+VPCMPGTDZrr 3953
+VPCMPGTDZrrk 3954
+VPCMPGTDrm 3955
+VPCMPGTDrr 3956
+VPCMPGTQYrm 3957
+VPCMPGTQYrr 3958
+VPCMPGTQZ 3959
+VPCMPGTQZrm 3960
+VPCMPGTQZrmb 3961
+VPCMPGTQZrmbk 3962
+VPCMPGTQZrmk 3963
+VPCMPGTQZrr 3964
+VPCMPGTQZrrk 3965
+VPCMPGTQrm 3966
+VPCMPGTQrr 3967
+VPCMPGTWYrm 3968
+VPCMPGTWYrr 3969
+VPCMPGTWZ 3970
+VPCMPGTWZrm 3971
+VPCMPGTWZrmk 3972
+VPCMPGTWZrr 3973
+VPCMPGTWZrrk 3974
+VPCMPGTWrm 3975
+VPCMPGTWrr 3976
+VPCMPISTRIrmi 3977
+VPCMPISTRIrri 3978
+VPCMPISTRMrmi 3979
+VPCMPISTRMrri 3980
+VPCMPQZ 3981
+VPCMPQZrmbi 3982
+VPCMPQZrmbik 3983
+VPCMPQZrmi 3984
+VPCMPQZrmik 3985
+VPCMPQZrri 3986
+VPCMPQZrrik 3987
+VPCMPUBZ 3988
+VPCMPUBZrmi 3989
+VPCMPUBZrmik 3990
+VPCMPUBZrri 3991
+VPCMPUBZrrik 3992
+VPCMPUDZ 3993
+VPCMPUDZrmbi 3994
+VPCMPUDZrmbik 3995
+VPCMPUDZrmi 3996
+VPCMPUDZrmik 3997
+VPCMPUDZrri 3998
+VPCMPUDZrrik 3999
+VPCMPUQZ 4000
+VPCMPUQZrmbi 4001
+VPCMPUQZrmbik 4002
+VPCMPUQZrmi 4003
+VPCMPUQZrmik 4004
+VPCMPUQZrri 4005
+VPCMPUQZrrik 4006
+VPCMPUWZ 4007
+VPCMPUWZrmi 4008
+VPCMPUWZrmik 4009
+VPCMPUWZrri 4010
+VPCMPUWZrrik 4011
+VPCMPWZ 4012
+VPCMPWZrmi 4013
+VPCMPWZrmik 4014
+VPCMPWZrri 4015
+VPCMPWZrrik 4016
+VPCOMBmi 4017
+VPCOMBri 4018
+VPCOMDmi 4019
+VPCOMDri 4020
+VPCOMPRESSBZ 4021
+VPCOMPRESSBZmr 4022
+VPCOMPRESSBZmrk 4023
+VPCOMPRESSBZrr 4024
+VPCOMPRESSBZrrk 4025
+VPCOMPRESSBZrrkz 4026
+VPCOMPRESSDZ 4027
+VPCOMPRESSDZmr 4028
+VPCOMPRESSDZmrk 4029
+VPCOMPRESSDZrr 4030
+VPCOMPRESSDZrrk 4031
+VPCOMPRESSDZrrkz 4032
+VPCOMPRESSQZ 4033
+VPCOMPRESSQZmr 4034
+VPCOMPRESSQZmrk 4035
+VPCOMPRESSQZrr 4036
+VPCOMPRESSQZrrk 4037
+VPCOMPRESSQZrrkz 4038
+VPCOMPRESSWZ 4039
+VPCOMPRESSWZmr 4040
+VPCOMPRESSWZmrk 4041
+VPCOMPRESSWZrr 4042
+VPCOMPRESSWZrrk 4043
+VPCOMPRESSWZrrkz 4044
+VPCOMQmi 4045
+VPCOMQri 4046
+VPCOMUBmi 4047
+VPCOMUBri 4048
+VPCOMUDmi 4049
+VPCOMUDri 4050
+VPCOMUQmi 4051
+VPCOMUQri 4052
+VPCOMUWmi 4053
+VPCOMUWri 4054
+VPCOMWmi 4055
+VPCOMWri 4056
+VPCONFLICTDZ 4057
+VPCONFLICTDZrm 4058
+VPCONFLICTDZrmb 4059
+VPCONFLICTDZrmbk 4060
+VPCONFLICTDZrmbkz 4061
+VPCONFLICTDZrmk 4062
+VPCONFLICTDZrmkz 4063
+VPCONFLICTDZrr 4064
+VPCONFLICTDZrrk 4065
+VPCONFLICTDZrrkz 4066
+VPCONFLICTQZ 4067
+VPCONFLICTQZrm 4068
+VPCONFLICTQZrmb 4069
+VPCONFLICTQZrmbk 4070
+VPCONFLICTQZrmbkz 4071
+VPCONFLICTQZrmk 4072
+VPCONFLICTQZrmkz 4073
+VPCONFLICTQZrr 4074
+VPCONFLICTQZrrk 4075
+VPCONFLICTQZrrkz 4076
+VPDPBSSDSYrm 4077
+VPDPBSSDSYrr 4078
+VPDPBSSDSZ 4079
+VPDPBSSDSZrm 4080
+VPDPBSSDSZrmb 4081
+VPDPBSSDSZrmbk 4082
+VPDPBSSDSZrmbkz 4083
+VPDPBSSDSZrmk 4084
+VPDPBSSDSZrmkz 4085
+VPDPBSSDSZrr 4086
+VPDPBSSDSZrrk 4087
+VPDPBSSDSZrrkz 4088
+VPDPBSSDSrm 4089
+VPDPBSSDSrr 4090
+VPDPBSSDYrm 4091
+VPDPBSSDYrr 4092
+VPDPBSSDZ 4093
+VPDPBSSDZrm 4094
+VPDPBSSDZrmb 4095
+VPDPBSSDZrmbk 4096
+VPDPBSSDZrmbkz 4097
+VPDPBSSDZrmk 4098
+VPDPBSSDZrmkz 4099
+VPDPBSSDZrr 4100
+VPDPBSSDZrrk 4101
+VPDPBSSDZrrkz 4102
+VPDPBSSDrm 4103
+VPDPBSSDrr 4104
+VPDPBSUDSYrm 4105
+VPDPBSUDSYrr 4106
+VPDPBSUDSZ 4107
+VPDPBSUDSZrm 4108
+VPDPBSUDSZrmb 4109
+VPDPBSUDSZrmbk 4110
+VPDPBSUDSZrmbkz 4111
+VPDPBSUDSZrmk 4112
+VPDPBSUDSZrmkz 4113
+VPDPBSUDSZrr 4114
+VPDPBSUDSZrrk 4115
+VPDPBSUDSZrrkz 4116
+VPDPBSUDSrm 4117
+VPDPBSUDSrr 4118
+VPDPBSUDYrm 4119
+VPDPBSUDYrr 4120
+VPDPBSUDZ 4121
+VPDPBSUDZrm 4122
+VPDPBSUDZrmb 4123
+VPDPBSUDZrmbk 4124
+VPDPBSUDZrmbkz 4125
+VPDPBSUDZrmk 4126
+VPDPBSUDZrmkz 4127
+VPDPBSUDZrr 4128
+VPDPBSUDZrrk 4129
+VPDPBSUDZrrkz 4130
+VPDPBSUDrm 4131
+VPDPBSUDrr 4132
+VPDPBUSDSYrm 4133
+VPDPBUSDSYrr 4134
+VPDPBUSDSZ 4135
+VPDPBUSDSZrm 4136
+VPDPBUSDSZrmb 4137
+VPDPBUSDSZrmbk 4138
+VPDPBUSDSZrmbkz 4139
+VPDPBUSDSZrmk 4140
+VPDPBUSDSZrmkz 4141
+VPDPBUSDSZrr 4142
+VPDPBUSDSZrrk 4143
+VPDPBUSDSZrrkz 4144
+VPDPBUSDSrm 4145
+VPDPBUSDSrr 4146
+VPDPBUSDYrm 4147
+VPDPBUSDYrr 4148
+VPDPBUSDZ 4149
+VPDPBUSDZrm 4150
+VPDPBUSDZrmb 4151
+VPDPBUSDZrmbk 4152
+VPDPBUSDZrmbkz 4153
+VPDPBUSDZrmk 4154
+VPDPBUSDZrmkz 4155
+VPDPBUSDZrr 4156
+VPDPBUSDZrrk 4157
+VPDPBUSDZrrkz 4158
+VPDPBUSDrm 4159
+VPDPBUSDrr 4160
+VPDPBUUDSYrm 4161
+VPDPBUUDSYrr 4162
+VPDPBUUDSZ 4163
+VPDPBUUDSZrm 4164
+VPDPBUUDSZrmb 4165
+VPDPBUUDSZrmbk 4166
+VPDPBUUDSZrmbkz 4167
+VPDPBUUDSZrmk 4168
+VPDPBUUDSZrmkz 4169
+VPDPBUUDSZrr 4170
+VPDPBUUDSZrrk 4171
+VPDPBUUDSZrrkz 4172
+VPDPBUUDSrm 4173
+VPDPBUUDSrr 4174
+VPDPBUUDYrm 4175
+VPDPBUUDYrr 4176
+VPDPBUUDZ 4177
+VPDPBUUDZrm 4178
+VPDPBUUDZrmb 4179
+VPDPBUUDZrmbk 4180
+VPDPBUUDZrmbkz 4181
+VPDPBUUDZrmk 4182
+VPDPBUUDZrmkz 4183
+VPDPBUUDZrr 4184
+VPDPBUUDZrrk 4185
+VPDPBUUDZrrkz 4186
+VPDPBUUDrm 4187
+VPDPBUUDrr 4188
+VPDPWSSDSYrm 4189
+VPDPWSSDSYrr 4190
+VPDPWSSDSZ 4191
+VPDPWSSDSZrm 4192
+VPDPWSSDSZrmb 4193
+VPDPWSSDSZrmbk 4194
+VPDPWSSDSZrmbkz 4195
+VPDPWSSDSZrmk 4196
+VPDPWSSDSZrmkz 4197
+VPDPWSSDSZrr 4198
+VPDPWSSDSZrrk 4199
+VPDPWSSDSZrrkz 4200
+VPDPWSSDSrm 4201
+VPDPWSSDSrr 4202
+VPDPWSSDYrm 4203
+VPDPWSSDYrr 4204
+VPDPWSSDZ 4205
+VPDPWSSDZrm 4206
+VPDPWSSDZrmb 4207
+VPDPWSSDZrmbk 4208
+VPDPWSSDZrmbkz 4209
+VPDPWSSDZrmk 4210
+VPDPWSSDZrmkz 4211
+VPDPWSSDZrr 4212
+VPDPWSSDZrrk 4213
+VPDPWSSDZrrkz 4214
+VPDPWSSDrm 4215
+VPDPWSSDrr 4216
+VPDPWSUDSYrm 4217
+VPDPWSUDSYrr 4218
+VPDPWSUDSZ 4219
+VPDPWSUDSZrm 4220
+VPDPWSUDSZrmb 4221
+VPDPWSUDSZrmbk 4222
+VPDPWSUDSZrmbkz 4223
+VPDPWSUDSZrmk 4224
+VPDPWSUDSZrmkz 4225
+VPDPWSUDSZrr 4226
+VPDPWSUDSZrrk 4227
+VPDPWSUDSZrrkz 4228
+VPDPWSUDSrm 4229
+VPDPWSUDSrr 4230
+VPDPWSUDYrm 4231
+VPDPWSUDYrr 4232
+VPDPWSUDZ 4233
+VPDPWSUDZrm 4234
+VPDPWSUDZrmb 4235
+VPDPWSUDZrmbk 4236
+VPDPWSUDZrmbkz 4237
+VPDPWSUDZrmk 4238
+VPDPWSUDZrmkz 4239
+VPDPWSUDZrr 4240
+VPDPWSUDZrrk 4241
+VPDPWSUDZrrkz 4242
+VPDPWSUDrm 4243
+VPDPWSUDrr 4244
+VPDPWUSDSYrm 4245
+VPDPWUSDSYrr 4246
+VPDPWUSDSZ 4247
+VPDPWUSDSZrm 4248
+VPDPWUSDSZrmb 4249
+VPDPWUSDSZrmbk 4250
+VPDPWUSDSZrmbkz 4251
+VPDPWUSDSZrmk 4252
+VPDPWUSDSZrmkz 4253
+VPDPWUSDSZrr 4254
+VPDPWUSDSZrrk 4255
+VPDPWUSDSZrrkz 4256
+VPDPWUSDSrm 4257
+VPDPWUSDSrr 4258
+VPDPWUSDYrm 4259
+VPDPWUSDYrr 4260
+VPDPWUSDZ 4261
+VPDPWUSDZrm 4262
+VPDPWUSDZrmb 4263
+VPDPWUSDZrmbk 4264
+VPDPWUSDZrmbkz 4265
+VPDPWUSDZrmk 4266
+VPDPWUSDZrmkz 4267
+VPDPWUSDZrr 4268
+VPDPWUSDZrrk 4269
+VPDPWUSDZrrkz 4270
+VPDPWUSDrm 4271
+VPDPWUSDrr 4272
+VPDPWUUDSYrm 4273
+VPDPWUUDSYrr 4274
+VPDPWUUDSZ 4275
+VPDPWUUDSZrm 4276
+VPDPWUUDSZrmb 4277
+VPDPWUUDSZrmbk 4278
+VPDPWUUDSZrmbkz 4279
+VPDPWUUDSZrmk 4280
+VPDPWUUDSZrmkz 4281
+VPDPWUUDSZrr 4282
+VPDPWUUDSZrrk 4283
+VPDPWUUDSZrrkz 4284
+VPDPWUUDSrm 4285
+VPDPWUUDSrr 4286
+VPDPWUUDYrm 4287
+VPDPWUUDYrr 4288
+VPDPWUUDZ 4289
+VPDPWUUDZrm 4290
+VPDPWUUDZrmb 4291
+VPDPWUUDZrmbk 4292
+VPDPWUUDZrmbkz 4293
+VPDPWUUDZrmk 4294
+VPDPWUUDZrmkz 4295
+VPDPWUUDZrr 4296
+VPDPWUUDZrrk 4297
+VPDPWUUDZrrkz 4298
+VPDPWUUDrm 4299
+VPDPWUUDrr 4300
+VPERM 4301
+VPERMBZ 4302
+VPERMBZrm 4303
+VPERMBZrmk 4304
+VPERMBZrmkz 4305
+VPERMBZrr 4306
+VPERMBZrrk 4307
+VPERMBZrrkz 4308
+VPERMDYrm 4309
+VPERMDYrr 4310
+VPERMDZ 4311
+VPERMDZrm 4312
+VPERMDZrmb 4313
+VPERMDZrmbk 4314
+VPERMDZrmbkz 4315
+VPERMDZrmk 4316
+VPERMDZrmkz 4317
+VPERMDZrr 4318
+VPERMDZrrk 4319
+VPERMDZrrkz 4320
+VPERMI 4321
+VPERMIL 4322
+VPERMILPDYmi 4323
+VPERMILPDYri 4324
+VPERMILPDYrm 4325
+VPERMILPDYrr 4326
+VPERMILPDZ 4327
+VPERMILPDZmbi 4328
+VPERMILPDZmbik 4329
+VPERMILPDZmbikz 4330
+VPERMILPDZmi 4331
+VPERMILPDZmik 4332
+VPERMILPDZmikz 4333
+VPERMILPDZri 4334
+VPERMILPDZrik 4335
+VPERMILPDZrikz 4336
+VPERMILPDZrm 4337
+VPERMILPDZrmb 4338
+VPERMILPDZrmbk 4339
+VPERMILPDZrmbkz 4340
+VPERMILPDZrmk 4341
+VPERMILPDZrmkz 4342
+VPERMILPDZrr 4343
+VPERMILPDZrrk 4344
+VPERMILPDZrrkz 4345
+VPERMILPDmi 4346
+VPERMILPDri 4347
+VPERMILPDrm 4348
+VPERMILPDrr 4349
+VPERMILPSYmi 4350
+VPERMILPSYri 4351
+VPERMILPSYrm 4352
+VPERMILPSYrr 4353
+VPERMILPSZ 4354
+VPERMILPSZmbi 4355
+VPERMILPSZmbik 4356
+VPERMILPSZmbikz 4357
+VPERMILPSZmi 4358
+VPERMILPSZmik 4359
+VPERMILPSZmikz 4360
+VPERMILPSZri 4361
+VPERMILPSZrik 4362
+VPERMILPSZrikz 4363
+VPERMILPSZrm 4364
+VPERMILPSZrmb 4365
+VPERMILPSZrmbk 4366
+VPERMILPSZrmbkz 4367
+VPERMILPSZrmk 4368
+VPERMILPSZrmkz 4369
+VPERMILPSZrr 4370
+VPERMILPSZrrk 4371
+VPERMILPSZrrkz 4372
+VPERMILPSmi 4373
+VPERMILPSri 4374
+VPERMILPSrm 4375
+VPERMILPSrr 4376
+VPERMPDYmi 4377
+VPERMPDYri 4378
+VPERMPDZ 4379
+VPERMPDZmbi 4380
+VPERMPDZmbik 4381
+VPERMPDZmbikz 4382
+VPERMPDZmi 4383
+VPERMPDZmik 4384
+VPERMPDZmikz 4385
+VPERMPDZri 4386
+VPERMPDZrik 4387
+VPERMPDZrikz 4388
+VPERMPDZrm 4389
+VPERMPDZrmb 4390
+VPERMPDZrmbk 4391
+VPERMPDZrmbkz 4392
+VPERMPDZrmk 4393
+VPERMPDZrmkz 4394
+VPERMPDZrr 4395
+VPERMPDZrrk 4396
+VPERMPDZrrkz 4397
+VPERMPSYrm 4398
+VPERMPSYrr 4399
+VPERMPSZ 4400
+VPERMPSZrm 4401
+VPERMPSZrmb 4402
+VPERMPSZrmbk 4403
+VPERMPSZrmbkz 4404
+VPERMPSZrmk 4405
+VPERMPSZrmkz 4406
+VPERMPSZrr 4407
+VPERMPSZrrk 4408
+VPERMPSZrrkz 4409
+VPERMQYmi 4410
+VPERMQYri 4411
+VPERMQZ 4412
+VPERMQZmbi 4413
+VPERMQZmbik 4414
+VPERMQZmbikz 4415
+VPERMQZmi 4416
+VPERMQZmik 4417
+VPERMQZmikz 4418
+VPERMQZri 4419
+VPERMQZrik 4420
+VPERMQZrikz 4421
+VPERMQZrm 4422
+VPERMQZrmb 4423
+VPERMQZrmbk 4424
+VPERMQZrmbkz 4425
+VPERMQZrmk 4426
+VPERMQZrmkz 4427
+VPERMQZrr 4428
+VPERMQZrrk 4429
+VPERMQZrrkz 4430
+VPERMT 4431
+VPERMWZ 4432
+VPERMWZrm 4433
+VPERMWZrmk 4434
+VPERMWZrmkz 4435
+VPERMWZrr 4436
+VPERMWZrrk 4437
+VPERMWZrrkz 4438
+VPEXPANDBZ 4439
+VPEXPANDBZrm 4440
+VPEXPANDBZrmk 4441
+VPEXPANDBZrmkz 4442
+VPEXPANDBZrr 4443
+VPEXPANDBZrrk 4444
+VPEXPANDBZrrkz 4445
+VPEXPANDDZ 4446
+VPEXPANDDZrm 4447
+VPEXPANDDZrmk 4448
+VPEXPANDDZrmkz 4449
+VPEXPANDDZrr 4450
+VPEXPANDDZrrk 4451
+VPEXPANDDZrrkz 4452
+VPEXPANDQZ 4453
+VPEXPANDQZrm 4454
+VPEXPANDQZrmk 4455
+VPEXPANDQZrmkz 4456
+VPEXPANDQZrr 4457
+VPEXPANDQZrrk 4458
+VPEXPANDQZrrkz 4459
+VPEXPANDWZ 4460
+VPEXPANDWZrm 4461
+VPEXPANDWZrmk 4462
+VPEXPANDWZrmkz 4463
+VPEXPANDWZrr 4464
+VPEXPANDWZrrk 4465
+VPEXPANDWZrrkz 4466
+VPEXTRBZmri 4467
+VPEXTRBZrri 4468
+VPEXTRBmri 4469
+VPEXTRBrri 4470
+VPEXTRDZmri 4471
+VPEXTRDZrri 4472
+VPEXTRDmri 4473
+VPEXTRDrri 4474
+VPEXTRQZmri 4475
+VPEXTRQZrri 4476
+VPEXTRQmri 4477
+VPEXTRQrri 4478
+VPEXTRWZmri 4479
+VPEXTRWZrri 4480
+VPEXTRWZrri_REV 4481
+VPEXTRWmri 4482
+VPEXTRWrri 4483
+VPEXTRWrri_REV 4484
+VPGATHERDDYrm 4485
+VPGATHERDDZ 4486
+VPGATHERDDZrm 4487
+VPGATHERDDrm 4488
+VPGATHERDQYrm 4489
+VPGATHERDQZ 4490
+VPGATHERDQZrm 4491
+VPGATHERDQrm 4492
+VPGATHERQDYrm 4493
+VPGATHERQDZ 4494
+VPGATHERQDZrm 4495
+VPGATHERQDrm 4496
+VPGATHERQQYrm 4497
+VPGATHERQQZ 4498
+VPGATHERQQZrm 4499
+VPGATHERQQrm 4500
+VPHADDBDrm 4501
+VPHADDBDrr 4502
+VPHADDBQrm 4503
+VPHADDBQrr 4504
+VPHADDBWrm 4505
+VPHADDBWrr 4506
+VPHADDDQrm 4507
+VPHADDDQrr 4508
+VPHADDDYrm 4509
+VPHADDDYrr 4510
+VPHADDDrm 4511
+VPHADDDrr 4512
+VPHADDSWYrm 4513
+VPHADDSWYrr 4514
+VPHADDSWrm 4515
+VPHADDSWrr 4516
+VPHADDUBDrm 4517
+VPHADDUBDrr 4518
+VPHADDUBQrm 4519
+VPHADDUBQrr 4520
+VPHADDUBWrm 4521
+VPHADDUBWrr 4522
+VPHADDUDQrm 4523
+VPHADDUDQrr 4524
+VPHADDUWDrm 4525
+VPHADDUWDrr 4526
+VPHADDUWQrm 4527
+VPHADDUWQrr 4528
+VPHADDWDrm 4529
+VPHADDWDrr 4530
+VPHADDWQrm 4531
+VPHADDWQrr 4532
+VPHADDWYrm 4533
+VPHADDWYrr 4534
+VPHADDWrm 4535
+VPHADDWrr 4536
+VPHMINPOSUWrm 4537
+VPHMINPOSUWrr 4538
+VPHSUBBWrm 4539
+VPHSUBBWrr 4540
+VPHSUBDQrm 4541
+VPHSUBDQrr 4542
+VPHSUBDYrm 4543
+VPHSUBDYrr 4544
+VPHSUBDrm 4545
+VPHSUBDrr 4546
+VPHSUBSWYrm 4547
+VPHSUBSWYrr 4548
+VPHSUBSWrm 4549
+VPHSUBSWrr 4550
+VPHSUBWDrm 4551
+VPHSUBWDrr 4552
+VPHSUBWYrm 4553
+VPHSUBWYrr 4554
+VPHSUBWrm 4555
+VPHSUBWrr 4556
+VPINSRBZrmi 4557
+VPINSRBZrri 4558
+VPINSRBrmi 4559
+VPINSRBrri 4560
+VPINSRDZrmi 4561
+VPINSRDZrri 4562
+VPINSRDrmi 4563
+VPINSRDrri 4564
+VPINSRQZrmi 4565
+VPINSRQZrri 4566
+VPINSRQrmi 4567
+VPINSRQrri 4568
+VPINSRWZrmi 4569
+VPINSRWZrri 4570
+VPINSRWrmi 4571
+VPINSRWrri 4572
+VPLZCNTDZ 4573
+VPLZCNTDZrm 4574
+VPLZCNTDZrmb 4575
+VPLZCNTDZrmbk 4576
+VPLZCNTDZrmbkz 4577
+VPLZCNTDZrmk 4578
+VPLZCNTDZrmkz 4579
+VPLZCNTDZrr 4580
+VPLZCNTDZrrk 4581
+VPLZCNTDZrrkz 4582
+VPLZCNTQZ 4583
+VPLZCNTQZrm 4584
+VPLZCNTQZrmb 4585
+VPLZCNTQZrmbk 4586
+VPLZCNTQZrmbkz 4587
+VPLZCNTQZrmk 4588
+VPLZCNTQZrmkz 4589
+VPLZCNTQZrr 4590
+VPLZCNTQZrrk 4591
+VPLZCNTQZrrkz 4592
+VPMACSDDrm 4593
+VPMACSDDrr 4594
+VPMACSDQHrm 4595
+VPMACSDQHrr 4596
+VPMACSDQLrm 4597
+VPMACSDQLrr 4598
+VPMACSSDDrm 4599
+VPMACSSDDrr 4600
+VPMACSSDQHrm 4601
+VPMACSSDQHrr 4602
+VPMACSSDQLrm 4603
+VPMACSSDQLrr 4604
+VPMACSSWDrm 4605
+VPMACSSWDrr 4606
+VPMACSSWWrm 4607
+VPMACSSWWrr 4608
+VPMACSWDrm 4609
+VPMACSWDrr 4610
+VPMACSWWrm 4611
+VPMACSWWrr 4612
+VPMADCSSWDrm 4613
+VPMADCSSWDrr 4614
+VPMADCSWDrm 4615
+VPMADCSWDrr 4616
+VPMADD 4617
+VPMADDUBSWYrm 4618
+VPMADDUBSWYrr 4619
+VPMADDUBSWZ 4620
+VPMADDUBSWZrm 4621
+VPMADDUBSWZrmk 4622
+VPMADDUBSWZrmkz 4623
+VPMADDUBSWZrr 4624
+VPMADDUBSWZrrk 4625
+VPMADDUBSWZrrkz 4626
+VPMADDUBSWrm 4627
+VPMADDUBSWrr 4628
+VPMADDWDYrm 4629
+VPMADDWDYrr 4630
+VPMADDWDZ 4631
+VPMADDWDZrm 4632
+VPMADDWDZrmk 4633
+VPMADDWDZrmkz 4634
+VPMADDWDZrr 4635
+VPMADDWDZrrk 4636
+VPMADDWDZrrkz 4637
+VPMADDWDrm 4638
+VPMADDWDrr 4639
+VPMASKMOVDYmr 4640
+VPMASKMOVDYrm 4641
+VPMASKMOVDmr 4642
+VPMASKMOVDrm 4643
+VPMASKMOVQYmr 4644
+VPMASKMOVQYrm 4645
+VPMASKMOVQmr 4646
+VPMASKMOVQrm 4647
+VPMAXSBYrm 4648
+VPMAXSBYrr 4649
+VPMAXSBZ 4650
+VPMAXSBZrm 4651
+VPMAXSBZrmk 4652
+VPMAXSBZrmkz 4653
+VPMAXSBZrr 4654
+VPMAXSBZrrk 4655
+VPMAXSBZrrkz 4656
+VPMAXSBrm 4657
+VPMAXSBrr 4658
+VPMAXSDYrm 4659
+VPMAXSDYrr 4660
+VPMAXSDZ 4661
+VPMAXSDZrm 4662
+VPMAXSDZrmb 4663
+VPMAXSDZrmbk 4664
+VPMAXSDZrmbkz 4665
+VPMAXSDZrmk 4666
+VPMAXSDZrmkz 4667
+VPMAXSDZrr 4668
+VPMAXSDZrrk 4669
+VPMAXSDZrrkz 4670
+VPMAXSDrm 4671
+VPMAXSDrr 4672
+VPMAXSQZ 4673
+VPMAXSQZrm 4674
+VPMAXSQZrmb 4675
+VPMAXSQZrmbk 4676
+VPMAXSQZrmbkz 4677
+VPMAXSQZrmk 4678
+VPMAXSQZrmkz 4679
+VPMAXSQZrr 4680
+VPMAXSQZrrk 4681
+VPMAXSQZrrkz 4682
+VPMAXSWYrm 4683
+VPMAXSWYrr 4684
+VPMAXSWZ 4685
+VPMAXSWZrm 4686
+VPMAXSWZrmk 4687
+VPMAXSWZrmkz 4688
+VPMAXSWZrr 4689
+VPMAXSWZrrk 4690
+VPMAXSWZrrkz 4691
+VPMAXSWrm 4692
+VPMAXSWrr 4693
+VPMAXUBYrm 4694
+VPMAXUBYrr 4695
+VPMAXUBZ 4696
+VPMAXUBZrm 4697
+VPMAXUBZrmk 4698
+VPMAXUBZrmkz 4699
+VPMAXUBZrr 4700
+VPMAXUBZrrk 4701
+VPMAXUBZrrkz 4702
+VPMAXUBrm 4703
+VPMAXUBrr 4704
+VPMAXUDYrm 4705
+VPMAXUDYrr 4706
+VPMAXUDZ 4707
+VPMAXUDZrm 4708
+VPMAXUDZrmb 4709
+VPMAXUDZrmbk 4710
+VPMAXUDZrmbkz 4711
+VPMAXUDZrmk 4712
+VPMAXUDZrmkz 4713
+VPMAXUDZrr 4714
+VPMAXUDZrrk 4715
+VPMAXUDZrrkz 4716
+VPMAXUDrm 4717
+VPMAXUDrr 4718
+VPMAXUQZ 4719
+VPMAXUQZrm 4720
+VPMAXUQZrmb 4721
+VPMAXUQZrmbk 4722
+VPMAXUQZrmbkz 4723
+VPMAXUQZrmk 4724
+VPMAXUQZrmkz 4725
+VPMAXUQZrr 4726
+VPMAXUQZrrk 4727
+VPMAXUQZrrkz 4728
+VPMAXUWYrm 4729
+VPMAXUWYrr 4730
+VPMAXUWZ 4731
+VPMAXUWZrm 4732
+VPMAXUWZrmk 4733
+VPMAXUWZrmkz 4734
+VPMAXUWZrr 4735
+VPMAXUWZrrk 4736
+VPMAXUWZrrkz 4737
+VPMAXUWrm 4738
+VPMAXUWrr 4739
+VPMINSBYrm 4740
+VPMINSBYrr 4741
+VPMINSBZ 4742
+VPMINSBZrm 4743
+VPMINSBZrmk 4744
+VPMINSBZrmkz 4745
+VPMINSBZrr 4746
+VPMINSBZrrk 4747
+VPMINSBZrrkz 4748
+VPMINSBrm 4749
+VPMINSBrr 4750
+VPMINSDYrm 4751
+VPMINSDYrr 4752
+VPMINSDZ 4753
+VPMINSDZrm 4754
+VPMINSDZrmb 4755
+VPMINSDZrmbk 4756
+VPMINSDZrmbkz 4757
+VPMINSDZrmk 4758
+VPMINSDZrmkz 4759
+VPMINSDZrr 4760
+VPMINSDZrrk 4761
+VPMINSDZrrkz 4762
+VPMINSDrm 4763
+VPMINSDrr 4764
+VPMINSQZ 4765
+VPMINSQZrm 4766
+VPMINSQZrmb 4767
+VPMINSQZrmbk 4768
+VPMINSQZrmbkz 4769
+VPMINSQZrmk 4770
+VPMINSQZrmkz 4771
+VPMINSQZrr 4772
+VPMINSQZrrk 4773
+VPMINSQZrrkz 4774
+VPMINSWYrm 4775
+VPMINSWYrr 4776
+VPMINSWZ 4777
+VPMINSWZrm 4778
+VPMINSWZrmk 4779
+VPMINSWZrmkz 4780
+VPMINSWZrr 4781
+VPMINSWZrrk 4782
+VPMINSWZrrkz 4783
+VPMINSWrm 4784
+VPMINSWrr 4785
+VPMINUBYrm 4786
+VPMINUBYrr 4787
+VPMINUBZ 4788
+VPMINUBZrm 4789
+VPMINUBZrmk 4790
+VPMINUBZrmkz 4791
+VPMINUBZrr 4792
+VPMINUBZrrk 4793
+VPMINUBZrrkz 4794
+VPMINUBrm 4795
+VPMINUBrr 4796
+VPMINUDYrm 4797
+VPMINUDYrr 4798
+VPMINUDZ 4799
+VPMINUDZrm 4800
+VPMINUDZrmb 4801
+VPMINUDZrmbk 4802
+VPMINUDZrmbkz 4803
+VPMINUDZrmk 4804
+VPMINUDZrmkz 4805
+VPMINUDZrr 4806
+VPMINUDZrrk 4807
+VPMINUDZrrkz 4808
+VPMINUDrm 4809
+VPMINUDrr 4810
+VPMINUQZ 4811
+VPMINUQZrm 4812
+VPMINUQZrmb 4813
+VPMINUQZrmbk 4814
+VPMINUQZrmbkz 4815
+VPMINUQZrmk 4816
+VPMINUQZrmkz 4817
+VPMINUQZrr 4818
+VPMINUQZrrk 4819
+VPMINUQZrrkz 4820
+VPMINUWYrm 4821
+VPMINUWYrr 4822
+VPMINUWZ 4823
+VPMINUWZrm 4824
+VPMINUWZrmk 4825
+VPMINUWZrmkz 4826
+VPMINUWZrr 4827
+VPMINUWZrrk 4828
+VPMINUWZrrkz 4829
+VPMINUWrm 4830
+VPMINUWrr 4831
+VPMOVB 4832
+VPMOVD 4833
+VPMOVDBZ 4834
+VPMOVDBZmr 4835
+VPMOVDBZmrk 4836
+VPMOVDBZrr 4837
+VPMOVDBZrrk 4838
+VPMOVDBZrrkz 4839
+VPMOVDWZ 4840
+VPMOVDWZmr 4841
+VPMOVDWZmrk 4842
+VPMOVDWZrr 4843
+VPMOVDWZrrk 4844
+VPMOVDWZrrkz 4845
+VPMOVM 4846
+VPMOVMSKBYrr 4847
+VPMOVMSKBrr 4848
+VPMOVQ 4849
+VPMOVQBZ 4850
+VPMOVQBZmr 4851
+VPMOVQBZmrk 4852
+VPMOVQBZrr 4853
+VPMOVQBZrrk 4854
+VPMOVQBZrrkz 4855
+VPMOVQDZ 4856
+VPMOVQDZmr 4857
+VPMOVQDZmrk 4858
+VPMOVQDZrr 4859
+VPMOVQDZrrk 4860
+VPMOVQDZrrkz 4861
+VPMOVQWZ 4862
+VPMOVQWZmr 4863
+VPMOVQWZmrk 4864
+VPMOVQWZrr 4865
+VPMOVQWZrrk 4866
+VPMOVQWZrrkz 4867
+VPMOVSDBZ 4868
+VPMOVSDBZmr 4869
+VPMOVSDBZmrk 4870
+VPMOVSDBZrr 4871
+VPMOVSDBZrrk 4872
+VPMOVSDBZrrkz 4873
+VPMOVSDWZ 4874
+VPMOVSDWZmr 4875
+VPMOVSDWZmrk 4876
+VPMOVSDWZrr 4877
+VPMOVSDWZrrk 4878
+VPMOVSDWZrrkz 4879
+VPMOVSQBZ 4880
+VPMOVSQBZmr 4881
+VPMOVSQBZmrk 4882
+VPMOVSQBZrr 4883
+VPMOVSQBZrrk 4884
+VPMOVSQBZrrkz 4885
+VPMOVSQDZ 4886
+VPMOVSQDZmr 4887
+VPMOVSQDZmrk 4888
+VPMOVSQDZrr 4889
+VPMOVSQDZrrk 4890
+VPMOVSQDZrrkz 4891
+VPMOVSQWZ 4892
+VPMOVSQWZmr 4893
+VPMOVSQWZmrk 4894
+VPMOVSQWZrr 4895
+VPMOVSQWZrrk 4896
+VPMOVSQWZrrkz 4897
+VPMOVSWBZ 4898
+VPMOVSWBZmr 4899
+VPMOVSWBZmrk 4900
+VPMOVSWBZrr 4901
+VPMOVSWBZrrk 4902
+VPMOVSWBZrrkz 4903
+VPMOVSXBDYrm 4904
+VPMOVSXBDYrr 4905
+VPMOVSXBDZ 4906
+VPMOVSXBDZrm 4907
+VPMOVSXBDZrmk 4908
+VPMOVSXBDZrmkz 4909
+VPMOVSXBDZrr 4910
+VPMOVSXBDZrrk 4911
+VPMOVSXBDZrrkz 4912
+VPMOVSXBDrm 4913
+VPMOVSXBDrr 4914
+VPMOVSXBQYrm 4915
+VPMOVSXBQYrr 4916
+VPMOVSXBQZ 4917
+VPMOVSXBQZrm 4918
+VPMOVSXBQZrmk 4919
+VPMOVSXBQZrmkz 4920
+VPMOVSXBQZrr 4921
+VPMOVSXBQZrrk 4922
+VPMOVSXBQZrrkz 4923
+VPMOVSXBQrm 4924
+VPMOVSXBQrr 4925
+VPMOVSXBWYrm 4926
+VPMOVSXBWYrr 4927
+VPMOVSXBWZ 4928
+VPMOVSXBWZrm 4929
+VPMOVSXBWZrmk 4930
+VPMOVSXBWZrmkz 4931
+VPMOVSXBWZrr 4932
+VPMOVSXBWZrrk 4933
+VPMOVSXBWZrrkz 4934
+VPMOVSXBWrm 4935
+VPMOVSXBWrr 4936
+VPMOVSXDQYrm 4937
+VPMOVSXDQYrr 4938
+VPMOVSXDQZ 4939
+VPMOVSXDQZrm 4940
+VPMOVSXDQZrmk 4941
+VPMOVSXDQZrmkz 4942
+VPMOVSXDQZrr 4943
+VPMOVSXDQZrrk 4944
+VPMOVSXDQZrrkz 4945
+VPMOVSXDQrm 4946
+VPMOVSXDQrr 4947
+VPMOVSXWDYrm 4948
+VPMOVSXWDYrr 4949
+VPMOVSXWDZ 4950
+VPMOVSXWDZrm 4951
+VPMOVSXWDZrmk 4952
+VPMOVSXWDZrmkz 4953
+VPMOVSXWDZrr 4954
+VPMOVSXWDZrrk 4955
+VPMOVSXWDZrrkz 4956
+VPMOVSXWDrm 4957
+VPMOVSXWDrr 4958
+VPMOVSXWQYrm 4959
+VPMOVSXWQYrr 4960
+VPMOVSXWQZ 4961
+VPMOVSXWQZrm 4962
+VPMOVSXWQZrmk 4963
+VPMOVSXWQZrmkz 4964
+VPMOVSXWQZrr 4965
+VPMOVSXWQZrrk 4966
+VPMOVSXWQZrrkz 4967
+VPMOVSXWQrm 4968
+VPMOVSXWQrr 4969
+VPMOVUSDBZ 4970
+VPMOVUSDBZmr 4971
+VPMOVUSDBZmrk 4972
+VPMOVUSDBZrr 4973
+VPMOVUSDBZrrk 4974
+VPMOVUSDBZrrkz 4975
+VPMOVUSDWZ 4976
+VPMOVUSDWZmr 4977
+VPMOVUSDWZmrk 4978
+VPMOVUSDWZrr 4979
+VPMOVUSDWZrrk 4980
+VPMOVUSDWZrrkz 4981
+VPMOVUSQBZ 4982
+VPMOVUSQBZmr 4983
+VPMOVUSQBZmrk 4984
+VPMOVUSQBZrr 4985
+VPMOVUSQBZrrk 4986
+VPMOVUSQBZrrkz 4987
+VPMOVUSQDZ 4988
+VPMOVUSQDZmr 4989
+VPMOVUSQDZmrk 4990
+VPMOVUSQDZrr 4991
+VPMOVUSQDZrrk 4992
+VPMOVUSQDZrrkz 4993
+VPMOVUSQWZ 4994
+VPMOVUSQWZmr 4995
+VPMOVUSQWZmrk 4996
+VPMOVUSQWZrr 4997
+VPMOVUSQWZrrk 4998
+VPMOVUSQWZrrkz 4999
+VPMOVUSWBZ 5000
+VPMOVUSWBZmr 5001
+VPMOVUSWBZmrk 5002
+VPMOVUSWBZrr 5003
+VPMOVUSWBZrrk 5004
+VPMOVUSWBZrrkz 5005
+VPMOVW 5006
+VPMOVWBZ 5007
+VPMOVWBZmr 5008
+VPMOVWBZmrk 5009
+VPMOVWBZrr 5010
+VPMOVWBZrrk 5011
+VPMOVWBZrrkz 5012
+VPMOVZXBDYrm 5013
+VPMOVZXBDYrr 5014
+VPMOVZXBDZ 5015
+VPMOVZXBDZrm 5016
+VPMOVZXBDZrmk 5017
+VPMOVZXBDZrmkz 5018
+VPMOVZXBDZrr 5019
+VPMOVZXBDZrrk 5020
+VPMOVZXBDZrrkz 5021
+VPMOVZXBDrm 5022
+VPMOVZXBDrr 5023
+VPMOVZXBQYrm 5024
+VPMOVZXBQYrr 5025
+VPMOVZXBQZ 5026
+VPMOVZXBQZrm 5027
+VPMOVZXBQZrmk 5028
+VPMOVZXBQZrmkz 5029
+VPMOVZXBQZrr 5030
+VPMOVZXBQZrrk 5031
+VPMOVZXBQZrrkz 5032
+VPMOVZXBQrm 5033
+VPMOVZXBQrr 5034
+VPMOVZXBWYrm 5035
+VPMOVZXBWYrr 5036
+VPMOVZXBWZ 5037
+VPMOVZXBWZrm 5038
+VPMOVZXBWZrmk 5039
+VPMOVZXBWZrmkz 5040
+VPMOVZXBWZrr 5041
+VPMOVZXBWZrrk 5042
+VPMOVZXBWZrrkz 5043
+VPMOVZXBWrm 5044
+VPMOVZXBWrr 5045
+VPMOVZXDQYrm 5046
+VPMOVZXDQYrr 5047
+VPMOVZXDQZ 5048
+VPMOVZXDQZrm 5049
+VPMOVZXDQZrmk 5050
+VPMOVZXDQZrmkz 5051
+VPMOVZXDQZrr 5052
+VPMOVZXDQZrrk 5053
+VPMOVZXDQZrrkz 5054
+VPMOVZXDQrm 5055
+VPMOVZXDQrr 5056
+VPMOVZXWDYrm 5057
+VPMOVZXWDYrr 5058
+VPMOVZXWDZ 5059
+VPMOVZXWDZrm 5060
+VPMOVZXWDZrmk 5061
+VPMOVZXWDZrmkz 5062
+VPMOVZXWDZrr 5063
+VPMOVZXWDZrrk 5064
+VPMOVZXWDZrrkz 5065
+VPMOVZXWDrm 5066
+VPMOVZXWDrr 5067
+VPMOVZXWQYrm 5068
+VPMOVZXWQYrr 5069
+VPMOVZXWQZ 5070
+VPMOVZXWQZrm 5071
+VPMOVZXWQZrmk 5072
+VPMOVZXWQZrmkz 5073
+VPMOVZXWQZrr 5074
+VPMOVZXWQZrrk 5075
+VPMOVZXWQZrrkz 5076
+VPMOVZXWQrm 5077
+VPMOVZXWQrr 5078
+VPMULDQYrm 5079
+VPMULDQYrr 5080
+VPMULDQZ 5081
+VPMULDQZrm 5082
+VPMULDQZrmb 5083
+VPMULDQZrmbk 5084
+VPMULDQZrmbkz 5085
+VPMULDQZrmk 5086
+VPMULDQZrmkz 5087
+VPMULDQZrr 5088
+VPMULDQZrrk 5089
+VPMULDQZrrkz 5090
+VPMULDQrm 5091
+VPMULDQrr 5092
+VPMULHRSWYrm 5093
+VPMULHRSWYrr 5094
+VPMULHRSWZ 5095
+VPMULHRSWZrm 5096
+VPMULHRSWZrmk 5097
+VPMULHRSWZrmkz 5098
+VPMULHRSWZrr 5099
+VPMULHRSWZrrk 5100
+VPMULHRSWZrrkz 5101
+VPMULHRSWrm 5102
+VPMULHRSWrr 5103
+VPMULHUWYrm 5104
+VPMULHUWYrr 5105
+VPMULHUWZ 5106
+VPMULHUWZrm 5107
+VPMULHUWZrmk 5108
+VPMULHUWZrmkz 5109
+VPMULHUWZrr 5110
+VPMULHUWZrrk 5111
+VPMULHUWZrrkz 5112
+VPMULHUWrm 5113
+VPMULHUWrr 5114
+VPMULHWYrm 5115
+VPMULHWYrr 5116
+VPMULHWZ 5117
+VPMULHWZrm 5118
+VPMULHWZrmk 5119
+VPMULHWZrmkz 5120
+VPMULHWZrr 5121
+VPMULHWZrrk 5122
+VPMULHWZrrkz 5123
+VPMULHWrm 5124
+VPMULHWrr 5125
+VPMULLDYrm 5126
+VPMULLDYrr 5127
+VPMULLDZ 5128
+VPMULLDZrm 5129
+VPMULLDZrmb 5130
+VPMULLDZrmbk 5131
+VPMULLDZrmbkz 5132
+VPMULLDZrmk 5133
+VPMULLDZrmkz 5134
+VPMULLDZrr 5135
+VPMULLDZrrk 5136
+VPMULLDZrrkz 5137
+VPMULLDrm 5138
+VPMULLDrr 5139
+VPMULLQZ 5140
+VPMULLQZrm 5141
+VPMULLQZrmb 5142
+VPMULLQZrmbk 5143
+VPMULLQZrmbkz 5144
+VPMULLQZrmk 5145
+VPMULLQZrmkz 5146
+VPMULLQZrr 5147
+VPMULLQZrrk 5148
+VPMULLQZrrkz 5149
+VPMULLWYrm 5150
+VPMULLWYrr 5151
+VPMULLWZ 5152
+VPMULLWZrm 5153
+VPMULLWZrmk 5154
+VPMULLWZrmkz 5155
+VPMULLWZrr 5156
+VPMULLWZrrk 5157
+VPMULLWZrrkz 5158
+VPMULLWrm 5159
+VPMULLWrr 5160
+VPMULTISHIFTQBZ 5161
+VPMULTISHIFTQBZrm 5162
+VPMULTISHIFTQBZrmb 5163
+VPMULTISHIFTQBZrmbk 5164
+VPMULTISHIFTQBZrmbkz 5165
+VPMULTISHIFTQBZrmk 5166
+VPMULTISHIFTQBZrmkz 5167
+VPMULTISHIFTQBZrr 5168
+VPMULTISHIFTQBZrrk 5169
+VPMULTISHIFTQBZrrkz 5170
+VPMULUDQYrm 5171
+VPMULUDQYrr 5172
+VPMULUDQZ 5173
+VPMULUDQZrm 5174
+VPMULUDQZrmb 5175
+VPMULUDQZrmbk 5176
+VPMULUDQZrmbkz 5177
+VPMULUDQZrmk 5178
+VPMULUDQZrmkz 5179
+VPMULUDQZrr 5180
+VPMULUDQZrrk 5181
+VPMULUDQZrrkz 5182
+VPMULUDQrm 5183
+VPMULUDQrr 5184
+VPOPCNTBZ 5185
+VPOPCNTBZrm 5186
+VPOPCNTBZrmk 5187
+VPOPCNTBZrmkz 5188
+VPOPCNTBZrr 5189
+VPOPCNTBZrrk 5190
+VPOPCNTBZrrkz 5191
+VPOPCNTDZ 5192
+VPOPCNTDZrm 5193
+VPOPCNTDZrmb 5194
+VPOPCNTDZrmbk 5195
+VPOPCNTDZrmbkz 5196
+VPOPCNTDZrmk 5197
+VPOPCNTDZrmkz 5198
+VPOPCNTDZrr 5199
+VPOPCNTDZrrk 5200
+VPOPCNTDZrrkz 5201
+VPOPCNTQZ 5202
+VPOPCNTQZrm 5203
+VPOPCNTQZrmb 5204
+VPOPCNTQZrmbk 5205
+VPOPCNTQZrmbkz 5206
+VPOPCNTQZrmk 5207
+VPOPCNTQZrmkz 5208
+VPOPCNTQZrr 5209
+VPOPCNTQZrrk 5210
+VPOPCNTQZrrkz 5211
+VPOPCNTWZ 5212
+VPOPCNTWZrm 5213
+VPOPCNTWZrmk 5214
+VPOPCNTWZrmkz 5215
+VPOPCNTWZrr 5216
+VPOPCNTWZrrk 5217
+VPOPCNTWZrrkz 5218
+VPORDZ 5219
+VPORDZrm 5220
+VPORDZrmb 5221
+VPORDZrmbk 5222
+VPORDZrmbkz 5223
+VPORDZrmk 5224
+VPORDZrmkz 5225
+VPORDZrr 5226
+VPORDZrrk 5227
+VPORDZrrkz 5228
+VPORQZ 5229
+VPORQZrm 5230
+VPORQZrmb 5231
+VPORQZrmbk 5232
+VPORQZrmbkz 5233
+VPORQZrmk 5234
+VPORQZrmkz 5235
+VPORQZrr 5236
+VPORQZrrk 5237
+VPORQZrrkz 5238
+VPORYrm 5239
+VPORYrr 5240
+VPORrm 5241
+VPORrr 5242
+VPPERMrmr 5243
+VPPERMrrm 5244
+VPPERMrrr 5245
+VPPERMrrr_REV 5246
+VPROLDZ 5247
+VPROLDZmbi 5248
+VPROLDZmbik 5249
+VPROLDZmbikz 5250
+VPROLDZmi 5251
+VPROLDZmik 5252
+VPROLDZmikz 5253
+VPROLDZri 5254
+VPROLDZrik 5255
+VPROLDZrikz 5256
+VPROLQZ 5257
+VPROLQZmbi 5258
+VPROLQZmbik 5259
+VPROLQZmbikz 5260
+VPROLQZmi 5261
+VPROLQZmik 5262
+VPROLQZmikz 5263
+VPROLQZri 5264
+VPROLQZrik 5265
+VPROLQZrikz 5266
+VPROLVDZ 5267
+VPROLVDZrm 5268
+VPROLVDZrmb 5269
+VPROLVDZrmbk 5270
+VPROLVDZrmbkz 5271
+VPROLVDZrmk 5272
+VPROLVDZrmkz 5273
+VPROLVDZrr 5274
+VPROLVDZrrk 5275
+VPROLVDZrrkz 5276
+VPROLVQZ 5277
+VPROLVQZrm 5278
+VPROLVQZrmb 5279
+VPROLVQZrmbk 5280
+VPROLVQZrmbkz 5281
+VPROLVQZrmk 5282
+VPROLVQZrmkz 5283
+VPROLVQZrr 5284
+VPROLVQZrrk 5285
+VPROLVQZrrkz 5286
+VPRORDZ 5287
+VPRORDZmbi 5288
+VPRORDZmbik 5289
+VPRORDZmbikz 5290
+VPRORDZmi 5291
+VPRORDZmik 5292
+VPRORDZmikz 5293
+VPRORDZri 5294
+VPRORDZrik 5295
+VPRORDZrikz 5296
+VPRORQZ 5297
+VPRORQZmbi 5298
+VPRORQZmbik 5299
+VPRORQZmbikz 5300
+VPRORQZmi 5301
+VPRORQZmik 5302
+VPRORQZmikz 5303
+VPRORQZri 5304
+VPRORQZrik 5305
+VPRORQZrikz 5306
+VPRORVDZ 5307
+VPRORVDZrm 5308
+VPRORVDZrmb 5309
+VPRORVDZrmbk 5310
+VPRORVDZrmbkz 5311
+VPRORVDZrmk 5312
+VPRORVDZrmkz 5313
+VPRORVDZrr 5314
+VPRORVDZrrk 5315
+VPRORVDZrrkz 5316
+VPRORVQZ 5317
+VPRORVQZrm 5318
+VPRORVQZrmb 5319
+VPRORVQZrmbk 5320
+VPRORVQZrmbkz 5321
+VPRORVQZrmk 5322
+VPRORVQZrmkz 5323
+VPRORVQZrr 5324
+VPRORVQZrrk 5325
+VPRORVQZrrkz 5326
+VPROTBmi 5327
+VPROTBmr 5328
+VPROTBri 5329
+VPROTBrm 5330
+VPROTBrr 5331
+VPROTBrr_REV 5332
+VPROTDmi 5333
+VPROTDmr 5334
+VPROTDri 5335
+VPROTDrm 5336
+VPROTDrr 5337
+VPROTDrr_REV 5338
+VPROTQmi 5339
+VPROTQmr 5340
+VPROTQri 5341
+VPROTQrm 5342
+VPROTQrr 5343
+VPROTQrr_REV 5344
+VPROTWmi 5345
+VPROTWmr 5346
+VPROTWri 5347
+VPROTWrm 5348
+VPROTWrr 5349
+VPROTWrr_REV 5350
+VPSADBWYrm 5351
+VPSADBWYrr 5352
+VPSADBWZ 5353
+VPSADBWZrm 5354
+VPSADBWZrr 5355
+VPSADBWrm 5356
+VPSADBWrr 5357
+VPSCATTERDDZ 5358
+VPSCATTERDDZmr 5359
+VPSCATTERDQZ 5360
+VPSCATTERDQZmr 5361
+VPSCATTERQDZ 5362
+VPSCATTERQDZmr 5363
+VPSCATTERQQZ 5364
+VPSCATTERQQZmr 5365
+VPSHABmr 5366
+VPSHABrm 5367
+VPSHABrr 5368
+VPSHABrr_REV 5369
+VPSHADmr 5370
+VPSHADrm 5371
+VPSHADrr 5372
+VPSHADrr_REV 5373
+VPSHAQmr 5374
+VPSHAQrm 5375
+VPSHAQrr 5376
+VPSHAQrr_REV 5377
+VPSHAWmr 5378
+VPSHAWrm 5379
+VPSHAWrr 5380
+VPSHAWrr_REV 5381
+VPSHLBmr 5382
+VPSHLBrm 5383
+VPSHLBrr 5384
+VPSHLBrr_REV 5385
+VPSHLDDZ 5386
+VPSHLDDZrmbi 5387
+VPSHLDDZrmbik 5388
+VPSHLDDZrmbikz 5389
+VPSHLDDZrmi 5390
+VPSHLDDZrmik 5391
+VPSHLDDZrmikz 5392
+VPSHLDDZrri 5393
+VPSHLDDZrrik 5394
+VPSHLDDZrrikz 5395
+VPSHLDQZ 5396
+VPSHLDQZrmbi 5397
+VPSHLDQZrmbik 5398
+VPSHLDQZrmbikz 5399
+VPSHLDQZrmi 5400
+VPSHLDQZrmik 5401
+VPSHLDQZrmikz 5402
+VPSHLDQZrri 5403
+VPSHLDQZrrik 5404
+VPSHLDQZrrikz 5405
+VPSHLDVDZ 5406
+VPSHLDVDZm 5407
+VPSHLDVDZmb 5408
+VPSHLDVDZmbk 5409
+VPSHLDVDZmbkz 5410
+VPSHLDVDZmk 5411
+VPSHLDVDZmkz 5412
+VPSHLDVDZr 5413
+VPSHLDVDZrk 5414
+VPSHLDVDZrkz 5415
+VPSHLDVQZ 5416
+VPSHLDVQZm 5417
+VPSHLDVQZmb 5418
+VPSHLDVQZmbk 5419
+VPSHLDVQZmbkz 5420
+VPSHLDVQZmk 5421
+VPSHLDVQZmkz 5422
+VPSHLDVQZr 5423
+VPSHLDVQZrk 5424
+VPSHLDVQZrkz 5425
+VPSHLDVWZ 5426
+VPSHLDVWZm 5427
+VPSHLDVWZmk 5428
+VPSHLDVWZmkz 5429
+VPSHLDVWZr 5430
+VPSHLDVWZrk 5431
+VPSHLDVWZrkz 5432
+VPSHLDWZ 5433
+VPSHLDWZrmi 5434
+VPSHLDWZrmik 5435
+VPSHLDWZrmikz 5436
+VPSHLDWZrri 5437
+VPSHLDWZrrik 5438
+VPSHLDWZrrikz 5439
+VPSHLDmr 5440
+VPSHLDrm 5441
+VPSHLDrr 5442
+VPSHLDrr_REV 5443
+VPSHLQmr 5444
+VPSHLQrm 5445
+VPSHLQrr 5446
+VPSHLQrr_REV 5447
+VPSHLWmr 5448
+VPSHLWrm 5449
+VPSHLWrr 5450
+VPSHLWrr_REV 5451
+VPSHRDDZ 5452
+VPSHRDDZrmbi 5453
+VPSHRDDZrmbik 5454
+VPSHRDDZrmbikz 5455
+VPSHRDDZrmi 5456
+VPSHRDDZrmik 5457
+VPSHRDDZrmikz 5458
+VPSHRDDZrri 5459
+VPSHRDDZrrik 5460
+VPSHRDDZrrikz 5461
+VPSHRDQZ 5462
+VPSHRDQZrmbi 5463
+VPSHRDQZrmbik 5464
+VPSHRDQZrmbikz 5465
+VPSHRDQZrmi 5466
+VPSHRDQZrmik 5467
+VPSHRDQZrmikz 5468
+VPSHRDQZrri 5469
+VPSHRDQZrrik 5470
+VPSHRDQZrrikz 5471
+VPSHRDVDZ 5472
+VPSHRDVDZm 5473
+VPSHRDVDZmb 5474
+VPSHRDVDZmbk 5475
+VPSHRDVDZmbkz 5476
+VPSHRDVDZmk 5477
+VPSHRDVDZmkz 5478
+VPSHRDVDZr 5479
+VPSHRDVDZrk 5480
+VPSHRDVDZrkz 5481
+VPSHRDVQZ 5482
+VPSHRDVQZm 5483
+VPSHRDVQZmb 5484
+VPSHRDVQZmbk 5485
+VPSHRDVQZmbkz 5486
+VPSHRDVQZmk 5487
+VPSHRDVQZmkz 5488
+VPSHRDVQZr 5489
+VPSHRDVQZrk 5490
+VPSHRDVQZrkz 5491
+VPSHRDVWZ 5492
+VPSHRDVWZm 5493
+VPSHRDVWZmk 5494
+VPSHRDVWZmkz 5495
+VPSHRDVWZr 5496
+VPSHRDVWZrk 5497
+VPSHRDVWZrkz 5498
+VPSHRDWZ 5499
+VPSHRDWZrmi 5500
+VPSHRDWZrmik 5501
+VPSHRDWZrmikz 5502
+VPSHRDWZrri 5503
+VPSHRDWZrrik 5504
+VPSHRDWZrrikz 5505
+VPSHUFBITQMBZ 5506
+VPSHUFBITQMBZrm 5507
+VPSHUFBITQMBZrmk 5508
+VPSHUFBITQMBZrr 5509
+VPSHUFBITQMBZrrk 5510
+VPSHUFBYrm 5511
+VPSHUFBYrr 5512
+VPSHUFBZ 5513
+VPSHUFBZrm 5514
+VPSHUFBZrmk 5515
+VPSHUFBZrmkz 5516
+VPSHUFBZrr 5517
+VPSHUFBZrrk 5518
+VPSHUFBZrrkz 5519
+VPSHUFBrm 5520
+VPSHUFBrr 5521
+VPSHUFDYmi 5522
+VPSHUFDYri 5523
+VPSHUFDZ 5524
+VPSHUFDZmbi 5525
+VPSHUFDZmbik 5526
+VPSHUFDZmbikz 5527
+VPSHUFDZmi 5528
+VPSHUFDZmik 5529
+VPSHUFDZmikz 5530
+VPSHUFDZri 5531
+VPSHUFDZrik 5532
+VPSHUFDZrikz 5533
+VPSHUFDmi 5534
+VPSHUFDri 5535
+VPSHUFHWYmi 5536
+VPSHUFHWYri 5537
+VPSHUFHWZ 5538
+VPSHUFHWZmi 5539
+VPSHUFHWZmik 5540
+VPSHUFHWZmikz 5541
+VPSHUFHWZri 5542
+VPSHUFHWZrik 5543
+VPSHUFHWZrikz 5544
+VPSHUFHWmi 5545
+VPSHUFHWri 5546
+VPSHUFLWYmi 5547
+VPSHUFLWYri 5548
+VPSHUFLWZ 5549
+VPSHUFLWZmi 5550
+VPSHUFLWZmik 5551
+VPSHUFLWZmikz 5552
+VPSHUFLWZri 5553
+VPSHUFLWZrik 5554
+VPSHUFLWZrikz 5555
+VPSHUFLWmi 5556
+VPSHUFLWri 5557
+VPSIGNBYrm 5558
+VPSIGNBYrr 5559
+VPSIGNBrm 5560
+VPSIGNBrr 5561
+VPSIGNDYrm 5562
+VPSIGNDYrr 5563
+VPSIGNDrm 5564
+VPSIGNDrr 5565
+VPSIGNWYrm 5566
+VPSIGNWYrr 5567
+VPSIGNWrm 5568
+VPSIGNWrr 5569
+VPSLLDQYri 5570
+VPSLLDQZ 5571
+VPSLLDQZmi 5572
+VPSLLDQZri 5573
+VPSLLDQri 5574
+VPSLLDYri 5575
+VPSLLDYrm 5576
+VPSLLDYrr 5577
+VPSLLDZ 5578
+VPSLLDZmbi 5579
+VPSLLDZmbik 5580
+VPSLLDZmbikz 5581
+VPSLLDZmi 5582
+VPSLLDZmik 5583
+VPSLLDZmikz 5584
+VPSLLDZri 5585
+VPSLLDZrik 5586
+VPSLLDZrikz 5587
+VPSLLDZrm 5588
+VPSLLDZrmk 5589
+VPSLLDZrmkz 5590
+VPSLLDZrr 5591
+VPSLLDZrrk 5592
+VPSLLDZrrkz 5593
+VPSLLDri 5594
+VPSLLDrm 5595
+VPSLLDrr 5596
+VPSLLQYri 5597
+VPSLLQYrm 5598
+VPSLLQYrr 5599
+VPSLLQZ 5600
+VPSLLQZmbi 5601
+VPSLLQZmbik 5602
+VPSLLQZmbikz 5603
+VPSLLQZmi 5604
+VPSLLQZmik 5605
+VPSLLQZmikz 5606
+VPSLLQZri 5607
+VPSLLQZrik 5608
+VPSLLQZrikz 5609
+VPSLLQZrm 5610
+VPSLLQZrmk 5611
+VPSLLQZrmkz 5612
+VPSLLQZrr 5613
+VPSLLQZrrk 5614
+VPSLLQZrrkz 5615
+VPSLLQri 5616
+VPSLLQrm 5617
+VPSLLQrr 5618
+VPSLLVDYrm 5619
+VPSLLVDYrr 5620
+VPSLLVDZ 5621
+VPSLLVDZrm 5622
+VPSLLVDZrmb 5623
+VPSLLVDZrmbk 5624
+VPSLLVDZrmbkz 5625
+VPSLLVDZrmk 5626
+VPSLLVDZrmkz 5627
+VPSLLVDZrr 5628
+VPSLLVDZrrk 5629
+VPSLLVDZrrkz 5630
+VPSLLVDrm 5631
+VPSLLVDrr 5632
+VPSLLVQYrm 5633
+VPSLLVQYrr 5634
+VPSLLVQZ 5635
+VPSLLVQZrm 5636
+VPSLLVQZrmb 5637
+VPSLLVQZrmbk 5638
+VPSLLVQZrmbkz 5639
+VPSLLVQZrmk 5640
+VPSLLVQZrmkz 5641
+VPSLLVQZrr 5642
+VPSLLVQZrrk 5643
+VPSLLVQZrrkz 5644
+VPSLLVQrm 5645
+VPSLLVQrr 5646
+VPSLLVWZ 5647
+VPSLLVWZrm 5648
+VPSLLVWZrmk 5649
+VPSLLVWZrmkz 5650
+VPSLLVWZrr 5651
+VPSLLVWZrrk 5652
+VPSLLVWZrrkz 5653
+VPSLLWYri 5654
+VPSLLWYrm 5655
+VPSLLWYrr 5656
+VPSLLWZ 5657
+VPSLLWZmi 5658
+VPSLLWZmik 5659
+VPSLLWZmikz 5660
+VPSLLWZri 5661
+VPSLLWZrik 5662
+VPSLLWZrikz 5663
+VPSLLWZrm 5664
+VPSLLWZrmk 5665
+VPSLLWZrmkz 5666
+VPSLLWZrr 5667
+VPSLLWZrrk 5668
+VPSLLWZrrkz 5669
+VPSLLWri 5670
+VPSLLWrm 5671
+VPSLLWrr 5672
+VPSRADYri 5673
+VPSRADYrm 5674
+VPSRADYrr 5675
+VPSRADZ 5676
+VPSRADZmbi 5677
+VPSRADZmbik 5678
+VPSRADZmbikz 5679
+VPSRADZmi 5680
+VPSRADZmik 5681
+VPSRADZmikz 5682
+VPSRADZri 5683
+VPSRADZrik 5684
+VPSRADZrikz 5685
+VPSRADZrm 5686
+VPSRADZrmk 5687
+VPSRADZrmkz 5688
+VPSRADZrr 5689
+VPSRADZrrk 5690
+VPSRADZrrkz 5691
+VPSRADri 5692
+VPSRADrm 5693
+VPSRADrr 5694
+VPSRAQZ 5695
+VPSRAQZmbi 5696
+VPSRAQZmbik 5697
+VPSRAQZmbikz 5698
+VPSRAQZmi 5699
+VPSRAQZmik 5700
+VPSRAQZmikz 5701
+VPSRAQZri 5702
+VPSRAQZrik 5703
+VPSRAQZrikz 5704
+VPSRAQZrm 5705
+VPSRAQZrmk 5706
+VPSRAQZrmkz 5707
+VPSRAQZrr 5708
+VPSRAQZrrk 5709
+VPSRAQZrrkz 5710
+VPSRAVDYrm 5711
+VPSRAVDYrr 5712
+VPSRAVDZ 5713
+VPSRAVDZrm 5714
+VPSRAVDZrmb 5715
+VPSRAVDZrmbk 5716
+VPSRAVDZrmbkz 5717
+VPSRAVDZrmk 5718
+VPSRAVDZrmkz 5719
+VPSRAVDZrr 5720
+VPSRAVDZrrk 5721
+VPSRAVDZrrkz 5722
+VPSRAVDrm 5723
+VPSRAVDrr 5724
+VPSRAVQZ 5725
+VPSRAVQZrm 5726
+VPSRAVQZrmb 5727
+VPSRAVQZrmbk 5728
+VPSRAVQZrmbkz 5729
+VPSRAVQZrmk 5730
+VPSRAVQZrmkz 5731
+VPSRAVQZrr 5732
+VPSRAVQZrrk 5733
+VPSRAVQZrrkz 5734
+VPSRAVWZ 5735
+VPSRAVWZrm 5736
+VPSRAVWZrmk 5737
+VPSRAVWZrmkz 5738
+VPSRAVWZrr 5739
+VPSRAVWZrrk 5740
+VPSRAVWZrrkz 5741
+VPSRAWYri 5742
+VPSRAWYrm 5743
+VPSRAWYrr 5744
+VPSRAWZ 5745
+VPSRAWZmi 5746
+VPSRAWZmik 5747
+VPSRAWZmikz 5748
+VPSRAWZri 5749
+VPSRAWZrik 5750
+VPSRAWZrikz 5751
+VPSRAWZrm 5752
+VPSRAWZrmk 5753
+VPSRAWZrmkz 5754
+VPSRAWZrr 5755
+VPSRAWZrrk 5756
+VPSRAWZrrkz 5757
+VPSRAWri 5758
+VPSRAWrm 5759
+VPSRAWrr 5760
+VPSRLDQYri 5761
+VPSRLDQZ 5762
+VPSRLDQZmi 5763
+VPSRLDQZri 5764
+VPSRLDQri 5765
+VPSRLDYri 5766
+VPSRLDYrm 5767
+VPSRLDYrr 5768
+VPSRLDZ 5769
+VPSRLDZmbi 5770
+VPSRLDZmbik 5771
+VPSRLDZmbikz 5772
+VPSRLDZmi 5773
+VPSRLDZmik 5774
+VPSRLDZmikz 5775
+VPSRLDZri 5776
+VPSRLDZrik 5777
+VPSRLDZrikz 5778
+VPSRLDZrm 5779
+VPSRLDZrmk 5780
+VPSRLDZrmkz 5781
+VPSRLDZrr 5782
+VPSRLDZrrk 5783
+VPSRLDZrrkz 5784
+VPSRLDri 5785
+VPSRLDrm 5786
+VPSRLDrr 5787
+VPSRLQYri 5788
+VPSRLQYrm 5789
+VPSRLQYrr 5790
+VPSRLQZ 5791
+VPSRLQZmbi 5792
+VPSRLQZmbik 5793
+VPSRLQZmbikz 5794
+VPSRLQZmi 5795
+VPSRLQZmik 5796
+VPSRLQZmikz 5797
+VPSRLQZri 5798
+VPSRLQZrik 5799
+VPSRLQZrikz 5800
+VPSRLQZrm 5801
+VPSRLQZrmk 5802
+VPSRLQZrmkz 5803
+VPSRLQZrr 5804
+VPSRLQZrrk 5805
+VPSRLQZrrkz 5806
+VPSRLQri 5807
+VPSRLQrm 5808
+VPSRLQrr 5809
+VPSRLVDYrm 5810
+VPSRLVDYrr 5811
+VPSRLVDZ 5812
+VPSRLVDZrm 5813
+VPSRLVDZrmb 5814
+VPSRLVDZrmbk 5815
+VPSRLVDZrmbkz 5816
+VPSRLVDZrmk 5817
+VPSRLVDZrmkz 5818
+VPSRLVDZrr 5819
+VPSRLVDZrrk 5820
+VPSRLVDZrrkz 5821
+VPSRLVDrm 5822
+VPSRLVDrr 5823
+VPSRLVQYrm 5824
+VPSRLVQYrr 5825
+VPSRLVQZ 5826
+VPSRLVQZrm 5827
+VPSRLVQZrmb 5828
+VPSRLVQZrmbk 5829
+VPSRLVQZrmbkz 5830
+VPSRLVQZrmk 5831
+VPSRLVQZrmkz 5832
+VPSRLVQZrr 5833
+VPSRLVQZrrk 5834
+VPSRLVQZrrkz 5835
+VPSRLVQrm 5836
+VPSRLVQrr 5837
+VPSRLVWZ 5838
+VPSRLVWZrm 5839
+VPSRLVWZrmk 5840
+VPSRLVWZrmkz 5841
+VPSRLVWZrr 5842
+VPSRLVWZrrk 5843
+VPSRLVWZrrkz 5844
+VPSRLWYri 5845
+VPSRLWYrm 5846
+VPSRLWYrr 5847
+VPSRLWZ 5848
+VPSRLWZmi 5849
+VPSRLWZmik 5850
+VPSRLWZmikz 5851
+VPSRLWZri 5852
+VPSRLWZrik 5853
+VPSRLWZrikz 5854
+VPSRLWZrm 5855
+VPSRLWZrmk 5856
+VPSRLWZrmkz 5857
+VPSRLWZrr 5858
+VPSRLWZrrk 5859
+VPSRLWZrrkz 5860
+VPSRLWri 5861
+VPSRLWrm 5862
+VPSRLWrr 5863
+VPSUBBYrm 5864
+VPSUBBYrr 5865
+VPSUBBZ 5866
+VPSUBBZrm 5867
+VPSUBBZrmk 5868
+VPSUBBZrmkz 5869
+VPSUBBZrr 5870
+VPSUBBZrrk 5871
+VPSUBBZrrkz 5872
+VPSUBBrm 5873
+VPSUBBrr 5874
+VPSUBDYrm 5875
+VPSUBDYrr 5876
+VPSUBDZ 5877
+VPSUBDZrm 5878
+VPSUBDZrmb 5879
+VPSUBDZrmbk 5880
+VPSUBDZrmbkz 5881
+VPSUBDZrmk 5882
+VPSUBDZrmkz 5883
+VPSUBDZrr 5884
+VPSUBDZrrk 5885
+VPSUBDZrrkz 5886
+VPSUBDrm 5887
+VPSUBDrr 5888
+VPSUBQYrm 5889
+VPSUBQYrr 5890
+VPSUBQZ 5891
+VPSUBQZrm 5892
+VPSUBQZrmb 5893
+VPSUBQZrmbk 5894
+VPSUBQZrmbkz 5895
+VPSUBQZrmk 5896
+VPSUBQZrmkz 5897
+VPSUBQZrr 5898
+VPSUBQZrrk 5899
+VPSUBQZrrkz 5900
+VPSUBQrm 5901
+VPSUBQrr 5902
+VPSUBSBYrm 5903
+VPSUBSBYrr 5904
+VPSUBSBZ 5905
+VPSUBSBZrm 5906
+VPSUBSBZrmk 5907
+VPSUBSBZrmkz 5908
+VPSUBSBZrr 5909
+VPSUBSBZrrk 5910
+VPSUBSBZrrkz 5911
+VPSUBSBrm 5912
+VPSUBSBrr 5913
+VPSUBSWYrm 5914
+VPSUBSWYrr 5915
+VPSUBSWZ 5916
+VPSUBSWZrm 5917
+VPSUBSWZrmk 5918
+VPSUBSWZrmkz 5919
+VPSUBSWZrr 5920
+VPSUBSWZrrk 5921
+VPSUBSWZrrkz 5922
+VPSUBSWrm 5923
+VPSUBSWrr 5924
+VPSUBUSBYrm 5925
+VPSUBUSBYrr 5926
+VPSUBUSBZ 5927
+VPSUBUSBZrm 5928
+VPSUBUSBZrmk 5929
+VPSUBUSBZrmkz 5930
+VPSUBUSBZrr 5931
+VPSUBUSBZrrk 5932
+VPSUBUSBZrrkz 5933
+VPSUBUSBrm 5934
+VPSUBUSBrr 5935
+VPSUBUSWYrm 5936
+VPSUBUSWYrr 5937
+VPSUBUSWZ 5938
+VPSUBUSWZrm 5939
+VPSUBUSWZrmk 5940
+VPSUBUSWZrmkz 5941
+VPSUBUSWZrr 5942
+VPSUBUSWZrrk 5943
+VPSUBUSWZrrkz 5944
+VPSUBUSWrm 5945
+VPSUBUSWrr 5946
+VPSUBWYrm 5947
+VPSUBWYrr 5948
+VPSUBWZ 5949
+VPSUBWZrm 5950
+VPSUBWZrmk 5951
+VPSUBWZrmkz 5952
+VPSUBWZrr 5953
+VPSUBWZrrk 5954
+VPSUBWZrrkz 5955
+VPSUBWrm 5956
+VPSUBWrr 5957
+VPTERNLOGDZ 5958
+VPTERNLOGDZrmbi 5959
+VPTERNLOGDZrmbik 5960
+VPTERNLOGDZrmbikz 5961
+VPTERNLOGDZrmi 5962
+VPTERNLOGDZrmik 5963
+VPTERNLOGDZrmikz 5964
+VPTERNLOGDZrri 5965
+VPTERNLOGDZrrik 5966
+VPTERNLOGDZrrikz 5967
+VPTERNLOGQZ 5968
+VPTERNLOGQZrmbi 5969
+VPTERNLOGQZrmbik 5970
+VPTERNLOGQZrmbikz 5971
+VPTERNLOGQZrmi 5972
+VPTERNLOGQZrmik 5973
+VPTERNLOGQZrmikz 5974
+VPTERNLOGQZrri 5975
+VPTERNLOGQZrrik 5976
+VPTERNLOGQZrrikz 5977
+VPTESTMBZ 5978
+VPTESTMBZrm 5979
+VPTESTMBZrmk 5980
+VPTESTMBZrr 5981
+VPTESTMBZrrk 5982
+VPTESTMDZ 5983
+VPTESTMDZrm 5984
+VPTESTMDZrmb 5985
+VPTESTMDZrmbk 5986
+VPTESTMDZrmk 5987
+VPTESTMDZrr 5988
+VPTESTMDZrrk 5989
+VPTESTMQZ 5990
+VPTESTMQZrm 5991
+VPTESTMQZrmb 5992
+VPTESTMQZrmbk 5993
+VPTESTMQZrmk 5994
+VPTESTMQZrr 5995
+VPTESTMQZrrk 5996
+VPTESTMWZ 5997
+VPTESTMWZrm 5998
+VPTESTMWZrmk 5999
+VPTESTMWZrr 6000
+VPTESTMWZrrk 6001
+VPTESTNMBZ 6002
+VPTESTNMBZrm 6003
+VPTESTNMBZrmk 6004
+VPTESTNMBZrr 6005
+VPTESTNMBZrrk 6006
+VPTESTNMDZ 6007
+VPTESTNMDZrm 6008
+VPTESTNMDZrmb 6009
+VPTESTNMDZrmbk 6010
+VPTESTNMDZrmk 6011
+VPTESTNMDZrr 6012
+VPTESTNMDZrrk 6013
+VPTESTNMQZ 6014
+VPTESTNMQZrm 6015
+VPTESTNMQZrmb 6016
+VPTESTNMQZrmbk 6017
+VPTESTNMQZrmk 6018
+VPTESTNMQZrr 6019
+VPTESTNMQZrrk 6020
+VPTESTNMWZ 6021
+VPTESTNMWZrm 6022
+VPTESTNMWZrmk 6023
+VPTESTNMWZrr 6024
+VPTESTNMWZrrk 6025
+VPTESTYrm 6026
+VPTESTYrr 6027
+VPTESTrm 6028
+VPTESTrr 6029
+VPUNPCKHBWYrm 6030
+VPUNPCKHBWYrr 6031
+VPUNPCKHBWZ 6032
+VPUNPCKHBWZrm 6033
+VPUNPCKHBWZrmk 6034
+VPUNPCKHBWZrmkz 6035
+VPUNPCKHBWZrr 6036
+VPUNPCKHBWZrrk 6037
+VPUNPCKHBWZrrkz 6038
+VPUNPCKHBWrm 6039
+VPUNPCKHBWrr 6040
+VPUNPCKHDQYrm 6041
+VPUNPCKHDQYrr 6042
+VPUNPCKHDQZ 6043
+VPUNPCKHDQZrm 6044
+VPUNPCKHDQZrmb 6045
+VPUNPCKHDQZrmbk 6046
+VPUNPCKHDQZrmbkz 6047
+VPUNPCKHDQZrmk 6048
+VPUNPCKHDQZrmkz 6049
+VPUNPCKHDQZrr 6050
+VPUNPCKHDQZrrk 6051
+VPUNPCKHDQZrrkz 6052
+VPUNPCKHDQrm 6053
+VPUNPCKHDQrr 6054
+VPUNPCKHQDQYrm 6055
+VPUNPCKHQDQYrr 6056
+VPUNPCKHQDQZ 6057
+VPUNPCKHQDQZrm 6058
+VPUNPCKHQDQZrmb 6059
+VPUNPCKHQDQZrmbk 6060
+VPUNPCKHQDQZrmbkz 6061
+VPUNPCKHQDQZrmk 6062
+VPUNPCKHQDQZrmkz 6063
+VPUNPCKHQDQZrr 6064
+VPUNPCKHQDQZrrk 6065
+VPUNPCKHQDQZrrkz 6066
+VPUNPCKHQDQrm 6067
+VPUNPCKHQDQrr 6068
+VPUNPCKHWDYrm 6069
+VPUNPCKHWDYrr 6070
+VPUNPCKHWDZ 6071
+VPUNPCKHWDZrm 6072
+VPUNPCKHWDZrmk 6073
+VPUNPCKHWDZrmkz 6074
+VPUNPCKHWDZrr 6075
+VPUNPCKHWDZrrk 6076
+VPUNPCKHWDZrrkz 6077
+VPUNPCKHWDrm 6078
+VPUNPCKHWDrr 6079
+VPUNPCKLBWYrm 6080
+VPUNPCKLBWYrr 6081
+VPUNPCKLBWZ 6082
+VPUNPCKLBWZrm 6083
+VPUNPCKLBWZrmk 6084
+VPUNPCKLBWZrmkz 6085
+VPUNPCKLBWZrr 6086
+VPUNPCKLBWZrrk 6087
+VPUNPCKLBWZrrkz 6088
+VPUNPCKLBWrm 6089
+VPUNPCKLBWrr 6090
+VPUNPCKLDQYrm 6091
+VPUNPCKLDQYrr 6092
+VPUNPCKLDQZ 6093
+VPUNPCKLDQZrm 6094
+VPUNPCKLDQZrmb 6095
+VPUNPCKLDQZrmbk 6096
+VPUNPCKLDQZrmbkz 6097
+VPUNPCKLDQZrmk 6098
+VPUNPCKLDQZrmkz 6099
+VPUNPCKLDQZrr 6100
+VPUNPCKLDQZrrk 6101
+VPUNPCKLDQZrrkz 6102
+VPUNPCKLDQrm 6103
+VPUNPCKLDQrr 6104
+VPUNPCKLQDQYrm 6105
+VPUNPCKLQDQYrr 6106
+VPUNPCKLQDQZ 6107
+VPUNPCKLQDQZrm 6108
+VPUNPCKLQDQZrmb 6109
+VPUNPCKLQDQZrmbk 6110
+VPUNPCKLQDQZrmbkz 6111
+VPUNPCKLQDQZrmk 6112
+VPUNPCKLQDQZrmkz 6113
+VPUNPCKLQDQZrr 6114
+VPUNPCKLQDQZrrk 6115
+VPUNPCKLQDQZrrkz 6116
+VPUNPCKLQDQrm 6117
+VPUNPCKLQDQrr 6118
+VPUNPCKLWDYrm 6119
+VPUNPCKLWDYrr 6120
+VPUNPCKLWDZ 6121
+VPUNPCKLWDZrm 6122
+VPUNPCKLWDZrmk 6123
+VPUNPCKLWDZrmkz 6124
+VPUNPCKLWDZrr 6125
+VPUNPCKLWDZrrk 6126
+VPUNPCKLWDZrrkz 6127
+VPUNPCKLWDrm 6128
+VPUNPCKLWDrr 6129
+VPXORDZ 6130
+VPXORDZrm 6131
+VPXORDZrmb 6132
+VPXORDZrmbk 6133
+VPXORDZrmbkz 6134
+VPXORDZrmk 6135
+VPXORDZrmkz 6136
+VPXORDZrr 6137
+VPXORDZrrk 6138
+VPXORDZrrkz 6139
+VPXORQZ 6140
+VPXORQZrm 6141
+VPXORQZrmb 6142
+VPXORQZrmbk 6143
+VPXORQZrmbkz 6144
+VPXORQZrmk 6145
+VPXORQZrmkz 6146
+VPXORQZrr 6147
+VPXORQZrrk 6148
+VPXORQZrrkz 6149
+VPXORYrm 6150
+VPXORYrr 6151
+VPXORrm 6152
+VPXORrr 6153
+VRANGEPDZ 6154
+VRANGEPDZrmbi 6155
+VRANGEPDZrmbik 6156
+VRANGEPDZrmbikz 6157
+VRANGEPDZrmi 6158
+VRANGEPDZrmik 6159
+VRANGEPDZrmikz 6160
+VRANGEPDZrri 6161
+VRANGEPDZrrib 6162
+VRANGEPDZrribk 6163
+VRANGEPDZrribkz 6164
+VRANGEPDZrrik 6165
+VRANGEPDZrrikz 6166
+VRANGEPSZ 6167
+VRANGEPSZrmbi 6168
+VRANGEPSZrmbik 6169
+VRANGEPSZrmbikz 6170
+VRANGEPSZrmi 6171
+VRANGEPSZrmik 6172
+VRANGEPSZrmikz 6173
+VRANGEPSZrri 6174
+VRANGEPSZrrib 6175
+VRANGEPSZrribk 6176
+VRANGEPSZrribkz 6177
+VRANGEPSZrrik 6178
+VRANGEPSZrrikz 6179
+VRANGESDZrmi 6180
+VRANGESDZrmik 6181
+VRANGESDZrmikz 6182
+VRANGESDZrri 6183
+VRANGESDZrrib 6184
+VRANGESDZrribk 6185
+VRANGESDZrribkz 6186
+VRANGESDZrrik 6187
+VRANGESDZrrikz 6188
+VRANGESSZrmi 6189
+VRANGESSZrmik 6190
+VRANGESSZrmikz 6191
+VRANGESSZrri 6192
+VRANGESSZrrib 6193
+VRANGESSZrribk 6194
+VRANGESSZrribkz 6195
+VRANGESSZrrik 6196
+VRANGESSZrrikz 6197
+VRCP 6198
+VRCPBF 6199
+VRCPPHZ 6200
+VRCPPHZm 6201
+VRCPPHZmb 6202
+VRCPPHZmbk 6203
+VRCPPHZmbkz 6204
+VRCPPHZmk 6205
+VRCPPHZmkz 6206
+VRCPPHZr 6207
+VRCPPHZrk 6208
+VRCPPHZrkz 6209
+VRCPPSYm 6210
+VRCPPSYr 6211
+VRCPPSm 6212
+VRCPPSr 6213
+VRCPSHZrm 6214
+VRCPSHZrmk 6215
+VRCPSHZrmkz 6216
+VRCPSHZrr 6217
+VRCPSHZrrk 6218
+VRCPSHZrrkz 6219
+VRCPSSm 6220
+VRCPSSm_Int 6221
+VRCPSSr 6222
+VRCPSSr_Int 6223
+VREDUCEBF 6224
+VREDUCEPDZ 6225
+VREDUCEPDZrmbi 6226
+VREDUCEPDZrmbik 6227
+VREDUCEPDZrmbikz 6228
+VREDUCEPDZrmi 6229
+VREDUCEPDZrmik 6230
+VREDUCEPDZrmikz 6231
+VREDUCEPDZrri 6232
+VREDUCEPDZrrib 6233
+VREDUCEPDZrribk 6234
+VREDUCEPDZrribkz 6235
+VREDUCEPDZrrik 6236
+VREDUCEPDZrrikz 6237
+VREDUCEPHZ 6238
+VREDUCEPHZrmbi 6239
+VREDUCEPHZrmbik 6240
+VREDUCEPHZrmbikz 6241
+VREDUCEPHZrmi 6242
+VREDUCEPHZrmik 6243
+VREDUCEPHZrmikz 6244
+VREDUCEPHZrri 6245
+VREDUCEPHZrrib 6246
+VREDUCEPHZrribk 6247
+VREDUCEPHZrribkz 6248
+VREDUCEPHZrrik 6249
+VREDUCEPHZrrikz 6250
+VREDUCEPSZ 6251
+VREDUCEPSZrmbi 6252
+VREDUCEPSZrmbik 6253
+VREDUCEPSZrmbikz 6254
+VREDUCEPSZrmi 6255
+VREDUCEPSZrmik 6256
+VREDUCEPSZrmikz 6257
+VREDUCEPSZrri 6258
+VREDUCEPSZrrib 6259
+VREDUCEPSZrribk 6260
+VREDUCEPSZrribkz 6261
+VREDUCEPSZrrik 6262
+VREDUCEPSZrrikz 6263
+VREDUCESDZrmi 6264
+VREDUCESDZrmik 6265
+VREDUCESDZrmikz 6266
+VREDUCESDZrri 6267
+VREDUCESDZrrib 6268
+VREDUCESDZrribk 6269
+VREDUCESDZrribkz 6270
+VREDUCESDZrrik 6271
+VREDUCESDZrrikz 6272
+VREDUCESHZrmi 6273
+VREDUCESHZrmik 6274
+VREDUCESHZrmikz 6275
+VREDUCESHZrri 6276
+VREDUCESHZrrib 6277
+VREDUCESHZrribk 6278
+VREDUCESHZrribkz 6279
+VREDUCESHZrrik 6280
+VREDUCESHZrrikz 6281
+VREDUCESSZrmi 6282
+VREDUCESSZrmik 6283
+VREDUCESSZrmikz 6284
+VREDUCESSZrri 6285
+VREDUCESSZrrib 6286
+VREDUCESSZrribk 6287
+VREDUCESSZrribkz 6288
+VREDUCESSZrrik 6289
+VREDUCESSZrrikz 6290
+VRNDSCALEBF 6291
+VRNDSCALEPDZ 6292
+VRNDSCALEPDZrmbi 6293
+VRNDSCALEPDZrmbik 6294
+VRNDSCALEPDZrmbikz 6295
+VRNDSCALEPDZrmi 6296
+VRNDSCALEPDZrmik 6297
+VRNDSCALEPDZrmikz 6298
+VRNDSCALEPDZrri 6299
+VRNDSCALEPDZrrib 6300
+VRNDSCALEPDZrribk 6301
+VRNDSCALEPDZrribkz 6302
+VRNDSCALEPDZrrik 6303
+VRNDSCALEPDZrrikz 6304
+VRNDSCALEPHZ 6305
+VRNDSCALEPHZrmbi 6306
+VRNDSCALEPHZrmbik 6307
+VRNDSCALEPHZrmbikz 6308
+VRNDSCALEPHZrmi 6309
+VRNDSCALEPHZrmik 6310
+VRNDSCALEPHZrmikz 6311
+VRNDSCALEPHZrri 6312
+VRNDSCALEPHZrrib 6313
+VRNDSCALEPHZrribk 6314
+VRNDSCALEPHZrribkz 6315
+VRNDSCALEPHZrrik 6316
+VRNDSCALEPHZrrikz 6317
+VRNDSCALEPSZ 6318
+VRNDSCALEPSZrmbi 6319
+VRNDSCALEPSZrmbik 6320
+VRNDSCALEPSZrmbikz 6321
+VRNDSCALEPSZrmi 6322
+VRNDSCALEPSZrmik 6323
+VRNDSCALEPSZrmikz 6324
+VRNDSCALEPSZrri 6325
+VRNDSCALEPSZrrib 6326
+VRNDSCALEPSZrribk 6327
+VRNDSCALEPSZrribkz 6328
+VRNDSCALEPSZrrik 6329
+VRNDSCALEPSZrrikz 6330
+VRNDSCALESDZrmi 6331
+VRNDSCALESDZrmi_Int 6332
+VRNDSCALESDZrmik_Int 6333
+VRNDSCALESDZrmikz_Int 6334
+VRNDSCALESDZrri 6335
+VRNDSCALESDZrri_Int 6336
+VRNDSCALESDZrrib_Int 6337
+VRNDSCALESDZrribk_Int 6338
+VRNDSCALESDZrribkz_Int 6339
+VRNDSCALESDZrrik_Int 6340
+VRNDSCALESDZrrikz_Int 6341
+VRNDSCALESHZrmi 6342
+VRNDSCALESHZrmi_Int 6343
+VRNDSCALESHZrmik_Int 6344
+VRNDSCALESHZrmikz_Int 6345
+VRNDSCALESHZrri 6346
+VRNDSCALESHZrri_Int 6347
+VRNDSCALESHZrrib_Int 6348
+VRNDSCALESHZrribk_Int 6349
+VRNDSCALESHZrribkz_Int 6350
+VRNDSCALESHZrrik_Int 6351
+VRNDSCALESHZrrikz_Int 6352
+VRNDSCALESSZrmi 6353
+VRNDSCALESSZrmi_Int 6354
+VRNDSCALESSZrmik_Int 6355
+VRNDSCALESSZrmikz_Int 6356
+VRNDSCALESSZrri 6357
+VRNDSCALESSZrri_Int 6358
+VRNDSCALESSZrrib_Int 6359
+VRNDSCALESSZrribk_Int 6360
+VRNDSCALESSZrribkz_Int 6361
+VRNDSCALESSZrrik_Int 6362
+VRNDSCALESSZrrikz_Int 6363
+VROUNDPDYmi 6364
+VROUNDPDYri 6365
+VROUNDPDmi 6366
+VROUNDPDri 6367
+VROUNDPSYmi 6368
+VROUNDPSYri 6369
+VROUNDPSmi 6370
+VROUNDPSri 6371
+VROUNDSDmi 6372
+VROUNDSDmi_Int 6373
+VROUNDSDri 6374
+VROUNDSDri_Int 6375
+VROUNDSSmi 6376
+VROUNDSSmi_Int 6377
+VROUNDSSri 6378
+VROUNDSSri_Int 6379
+VRSQRT 6380
+VRSQRTBF 6381
+VRSQRTPHZ 6382
+VRSQRTPHZm 6383
+VRSQRTPHZmb 6384
+VRSQRTPHZmbk 6385
+VRSQRTPHZmbkz 6386
+VRSQRTPHZmk 6387
+VRSQRTPHZmkz 6388
+VRSQRTPHZr 6389
+VRSQRTPHZrk 6390
+VRSQRTPHZrkz 6391
+VRSQRTPSYm 6392
+VRSQRTPSYr 6393
+VRSQRTPSm 6394
+VRSQRTPSr 6395
+VRSQRTSHZrm 6396
+VRSQRTSHZrmk 6397
+VRSQRTSHZrmkz 6398
+VRSQRTSHZrr 6399
+VRSQRTSHZrrk 6400
+VRSQRTSHZrrkz 6401
+VRSQRTSSm 6402
+VRSQRTSSm_Int 6403
+VRSQRTSSr 6404
+VRSQRTSSr_Int 6405
+VSCALEFBF 6406
+VSCALEFPDZ 6407
+VSCALEFPDZrm 6408
+VSCALEFPDZrmb 6409
+VSCALEFPDZrmbk 6410
+VSCALEFPDZrmbkz 6411
+VSCALEFPDZrmk 6412
+VSCALEFPDZrmkz 6413
+VSCALEFPDZrr 6414
+VSCALEFPDZrrb 6415
+VSCALEFPDZrrbk 6416
+VSCALEFPDZrrbkz 6417
+VSCALEFPDZrrk 6418
+VSCALEFPDZrrkz 6419
+VSCALEFPHZ 6420
+VSCALEFPHZrm 6421
+VSCALEFPHZrmb 6422
+VSCALEFPHZrmbk 6423
+VSCALEFPHZrmbkz 6424
+VSCALEFPHZrmk 6425
+VSCALEFPHZrmkz 6426
+VSCALEFPHZrr 6427
+VSCALEFPHZrrb 6428
+VSCALEFPHZrrbk 6429
+VSCALEFPHZrrbkz 6430
+VSCALEFPHZrrk 6431
+VSCALEFPHZrrkz 6432
+VSCALEFPSZ 6433
+VSCALEFPSZrm 6434
+VSCALEFPSZrmb 6435
+VSCALEFPSZrmbk 6436
+VSCALEFPSZrmbkz 6437
+VSCALEFPSZrmk 6438
+VSCALEFPSZrmkz 6439
+VSCALEFPSZrr 6440
+VSCALEFPSZrrb 6441
+VSCALEFPSZrrbk 6442
+VSCALEFPSZrrbkz 6443
+VSCALEFPSZrrk 6444
+VSCALEFPSZrrkz 6445
+VSCALEFSDZrm 6446
+VSCALEFSDZrmk 6447
+VSCALEFSDZrmkz 6448
+VSCALEFSDZrr 6449
+VSCALEFSDZrrb_Int 6450
+VSCALEFSDZrrbk_Int 6451
+VSCALEFSDZrrbkz_Int 6452
+VSCALEFSDZrrk 6453
+VSCALEFSDZrrkz 6454
+VSCALEFSHZrm 6455
+VSCALEFSHZrmk 6456
+VSCALEFSHZrmkz 6457
+VSCALEFSHZrr 6458
+VSCALEFSHZrrb_Int 6459
+VSCALEFSHZrrbk_Int 6460
+VSCALEFSHZrrbkz_Int 6461
+VSCALEFSHZrrk 6462
+VSCALEFSHZrrkz 6463
+VSCALEFSSZrm 6464
+VSCALEFSSZrmk 6465
+VSCALEFSSZrmkz 6466
+VSCALEFSSZrr 6467
+VSCALEFSSZrrb_Int 6468
+VSCALEFSSZrrbk_Int 6469
+VSCALEFSSZrrbkz_Int 6470
+VSCALEFSSZrrk 6471
+VSCALEFSSZrrkz 6472
+VSCATTERDPDZ 6473
+VSCATTERDPDZmr 6474
+VSCATTERDPSZ 6475
+VSCATTERDPSZmr 6476
+VSCATTERPF 6477
+VSCATTERQPDZ 6478
+VSCATTERQPDZmr 6479
+VSCATTERQPSZ 6480
+VSCATTERQPSZmr 6481
+VSHA 6482
+VSHUFF 6483
+VSHUFI 6484
+VSHUFPDYrmi 6485
+VSHUFPDYrri 6486
+VSHUFPDZ 6487
+VSHUFPDZrmbi 6488
+VSHUFPDZrmbik 6489
+VSHUFPDZrmbikz 6490
+VSHUFPDZrmi 6491
+VSHUFPDZrmik 6492
+VSHUFPDZrmikz 6493
+VSHUFPDZrri 6494
+VSHUFPDZrrik 6495
+VSHUFPDZrrikz 6496
+VSHUFPDrmi 6497
+VSHUFPDrri 6498
+VSHUFPSYrmi 6499
+VSHUFPSYrri 6500
+VSHUFPSZ 6501
+VSHUFPSZrmbi 6502
+VSHUFPSZrmbik 6503
+VSHUFPSZrmbikz 6504
+VSHUFPSZrmi 6505
+VSHUFPSZrmik 6506
+VSHUFPSZrmikz 6507
+VSHUFPSZrri 6508
+VSHUFPSZrrik 6509
+VSHUFPSZrrikz 6510
+VSHUFPSrmi 6511
+VSHUFPSrri 6512
+VSM 6513
+VSQRTBF 6514
+VSQRTPDYm 6515
+VSQRTPDYr 6516
+VSQRTPDZ 6517
+VSQRTPDZm 6518
+VSQRTPDZmb 6519
+VSQRTPDZmbk 6520
+VSQRTPDZmbkz 6521
+VSQRTPDZmk 6522
+VSQRTPDZmkz 6523
+VSQRTPDZr 6524
+VSQRTPDZrb 6525
+VSQRTPDZrbk 6526
+VSQRTPDZrbkz 6527
+VSQRTPDZrk 6528
+VSQRTPDZrkz 6529
+VSQRTPDm 6530
+VSQRTPDr 6531
+VSQRTPHZ 6532
+VSQRTPHZm 6533
+VSQRTPHZmb 6534
+VSQRTPHZmbk 6535
+VSQRTPHZmbkz 6536
+VSQRTPHZmk 6537
+VSQRTPHZmkz 6538
+VSQRTPHZr 6539
+VSQRTPHZrb 6540
+VSQRTPHZrbk 6541
+VSQRTPHZrbkz 6542
+VSQRTPHZrk 6543
+VSQRTPHZrkz 6544
+VSQRTPSYm 6545
+VSQRTPSYr 6546
+VSQRTPSZ 6547
+VSQRTPSZm 6548
+VSQRTPSZmb 6549
+VSQRTPSZmbk 6550
+VSQRTPSZmbkz 6551
+VSQRTPSZmk 6552
+VSQRTPSZmkz 6553
+VSQRTPSZr 6554
+VSQRTPSZrb 6555
+VSQRTPSZrbk 6556
+VSQRTPSZrbkz 6557
+VSQRTPSZrk 6558
+VSQRTPSZrkz 6559
+VSQRTPSm 6560
+VSQRTPSr 6561
+VSQRTSDZm 6562
+VSQRTSDZm_Int 6563
+VSQRTSDZmk_Int 6564
+VSQRTSDZmkz_Int 6565
+VSQRTSDZr 6566
+VSQRTSDZr_Int 6567
+VSQRTSDZrb_Int 6568
+VSQRTSDZrbk_Int 6569
+VSQRTSDZrbkz_Int 6570
+VSQRTSDZrk_Int 6571
+VSQRTSDZrkz_Int 6572
+VSQRTSDm 6573
+VSQRTSDm_Int 6574
+VSQRTSDr 6575
+VSQRTSDr_Int 6576
+VSQRTSHZm 6577
+VSQRTSHZm_Int 6578
+VSQRTSHZmk_Int 6579
+VSQRTSHZmkz_Int 6580
+VSQRTSHZr 6581
+VSQRTSHZr_Int 6582
+VSQRTSHZrb_Int 6583
+VSQRTSHZrbk_Int 6584
+VSQRTSHZrbkz_Int 6585
+VSQRTSHZrk_Int 6586
+VSQRTSHZrkz_Int 6587
+VSQRTSSZm 6588
+VSQRTSSZm_Int 6589
+VSQRTSSZmk_Int 6590
+VSQRTSSZmkz_Int 6591
+VSQRTSSZr 6592
+VSQRTSSZr_Int 6593
+VSQRTSSZrb_Int 6594
+VSQRTSSZrbk_Int 6595
+VSQRTSSZrbkz_Int 6596
+VSQRTSSZrk_Int 6597
+VSQRTSSZrkz_Int 6598
+VSQRTSSm 6599
+VSQRTSSm_Int 6600
+VSQRTSSr 6601
+VSQRTSSr_Int 6602
+VSTMXCSR 6603
+VSUBBF 6604
+VSUBPDYrm 6605
+VSUBPDYrr 6606
+VSUBPDZ 6607
+VSUBPDZrm 6608
+VSUBPDZrmb 6609
+VSUBPDZrmbk 6610
+VSUBPDZrmbkz 6611
+VSUBPDZrmk 6612
+VSUBPDZrmkz 6613
+VSUBPDZrr 6614
+VSUBPDZrrb 6615
+VSUBPDZrrbk 6616
+VSUBPDZrrbkz 6617
+VSUBPDZrrk 6618
+VSUBPDZrrkz 6619
+VSUBPDrm 6620
+VSUBPDrr 6621
+VSUBPHZ 6622
+VSUBPHZrm 6623
+VSUBPHZrmb 6624
+VSUBPHZrmbk 6625
+VSUBPHZrmbkz 6626
+VSUBPHZrmk 6627
+VSUBPHZrmkz 6628
+VSUBPHZrr 6629
+VSUBPHZrrb 6630
+VSUBPHZrrbk 6631
+VSUBPHZrrbkz 6632
+VSUBPHZrrk 6633
+VSUBPHZrrkz 6634
+VSUBPSYrm 6635
+VSUBPSYrr 6636
+VSUBPSZ 6637
+VSUBPSZrm 6638
+VSUBPSZrmb 6639
+VSUBPSZrmbk 6640
+VSUBPSZrmbkz 6641
+VSUBPSZrmk 6642
+VSUBPSZrmkz 6643
+VSUBPSZrr 6644
+VSUBPSZrrb 6645
+VSUBPSZrrbk 6646
+VSUBPSZrrbkz 6647
+VSUBPSZrrk 6648
+VSUBPSZrrkz 6649
+VSUBPSrm 6650
+VSUBPSrr 6651
+VSUBSDZrm 6652
+VSUBSDZrm_Int 6653
+VSUBSDZrmk_Int 6654
+VSUBSDZrmkz_Int 6655
+VSUBSDZrr 6656
+VSUBSDZrr_Int 6657
+VSUBSDZrrb_Int 6658
+VSUBSDZrrbk_Int 6659
+VSUBSDZrrbkz_Int 6660
+VSUBSDZrrk_Int 6661
+VSUBSDZrrkz_Int 6662
+VSUBSDrm 6663
+VSUBSDrm_Int 6664
+VSUBSDrr 6665
+VSUBSDrr_Int 6666
+VSUBSHZrm 6667
+VSUBSHZrm_Int 6668
+VSUBSHZrmk_Int 6669
+VSUBSHZrmkz_Int 6670
+VSUBSHZrr 6671
+VSUBSHZrr_Int 6672
+VSUBSHZrrb_Int 6673
+VSUBSHZrrbk_Int 6674
+VSUBSHZrrbkz_Int 6675
+VSUBSHZrrk_Int 6676
+VSUBSHZrrkz_Int 6677
+VSUBSSZrm 6678
+VSUBSSZrm_Int 6679
+VSUBSSZrmk_Int 6680
+VSUBSSZrmkz_Int 6681
+VSUBSSZrr 6682
+VSUBSSZrr_Int 6683
+VSUBSSZrrb_Int 6684
+VSUBSSZrrbk_Int 6685
+VSUBSSZrrbkz_Int 6686
+VSUBSSZrrk_Int 6687
+VSUBSSZrrkz_Int 6688
+VSUBSSrm 6689
+VSUBSSrm_Int 6690
+VSUBSSrr 6691
+VSUBSSrr_Int 6692
+VTESTPDYrm 6693
+VTESTPDYrr 6694
+VTESTPDrm 6695
+VTESTPDrr 6696
+VTESTPSYrm 6697
+VTESTPSYrr 6698
+VTESTPSrm 6699
+VTESTPSrr 6700
+VUCOMISDZrm 6701
+VUCOMISDZrm_Int 6702
+VUCOMISDZrr 6703
+VUCOMISDZrr_Int 6704
+VUCOMISDZrrb 6705
+VUCOMISDrm 6706
+VUCOMISDrm_Int 6707
+VUCOMISDrr 6708
+VUCOMISDrr_Int 6709
+VUCOMISHZrm 6710
+VUCOMISHZrm_Int 6711
+VUCOMISHZrr 6712
+VUCOMISHZrr_Int 6713
+VUCOMISHZrrb 6714
+VUCOMISSZrm 6715
+VUCOMISSZrm_Int 6716
+VUCOMISSZrr 6717
+VUCOMISSZrr_Int 6718
+VUCOMISSZrrb 6719
+VUCOMISSrm 6720
+VUCOMISSrm_Int 6721
+VUCOMISSrr 6722
+VUCOMISSrr_Int 6723
+VUCOMXSDZrm 6724
+VUCOMXSDZrm_Int 6725
+VUCOMXSDZrr 6726
+VUCOMXSDZrr_Int 6727
+VUCOMXSDZrrb_Int 6728
+VUCOMXSHZrm 6729
+VUCOMXSHZrm_Int 6730
+VUCOMXSHZrr 6731
+VUCOMXSHZrr_Int 6732
+VUCOMXSHZrrb_Int 6733
+VUCOMXSSZrm 6734
+VUCOMXSSZrm_Int 6735
+VUCOMXSSZrr 6736
+VUCOMXSSZrr_Int 6737
+VUCOMXSSZrrb_Int 6738
+VUNPCKHPDYrm 6739
+VUNPCKHPDYrr 6740
+VUNPCKHPDZ 6741
+VUNPCKHPDZrm 6742
+VUNPCKHPDZrmb 6743
+VUNPCKHPDZrmbk 6744
+VUNPCKHPDZrmbkz 6745
+VUNPCKHPDZrmk 6746
+VUNPCKHPDZrmkz 6747
+VUNPCKHPDZrr 6748
+VUNPCKHPDZrrk 6749
+VUNPCKHPDZrrkz 6750
+VUNPCKHPDrm 6751
+VUNPCKHPDrr 6752
+VUNPCKHPSYrm 6753
+VUNPCKHPSYrr 6754
+VUNPCKHPSZ 6755
+VUNPCKHPSZrm 6756
+VUNPCKHPSZrmb 6757
+VUNPCKHPSZrmbk 6758
+VUNPCKHPSZrmbkz 6759
+VUNPCKHPSZrmk 6760
+VUNPCKHPSZrmkz 6761
+VUNPCKHPSZrr 6762
+VUNPCKHPSZrrk 6763
+VUNPCKHPSZrrkz 6764
+VUNPCKHPSrm 6765
+VUNPCKHPSrr 6766
+VUNPCKLPDYrm 6767
+VUNPCKLPDYrr 6768
+VUNPCKLPDZ 6769
+VUNPCKLPDZrm 6770
+VUNPCKLPDZrmb 6771
+VUNPCKLPDZrmbk 6772
+VUNPCKLPDZrmbkz 6773
+VUNPCKLPDZrmk 6774
+VUNPCKLPDZrmkz 6775
+VUNPCKLPDZrr 6776
+VUNPCKLPDZrrk 6777
+VUNPCKLPDZrrkz 6778
+VUNPCKLPDrm 6779
+VUNPCKLPDrr 6780
+VUNPCKLPSYrm 6781
+VUNPCKLPSYrr 6782
+VUNPCKLPSZ 6783
+VUNPCKLPSZrm 6784
+VUNPCKLPSZrmb 6785
+VUNPCKLPSZrmbk 6786
+VUNPCKLPSZrmbkz 6787
+VUNPCKLPSZrmk 6788
+VUNPCKLPSZrmkz 6789
+VUNPCKLPSZrr 6790
+VUNPCKLPSZrrk 6791
+VUNPCKLPSZrrkz 6792
+VUNPCKLPSrm 6793
+VUNPCKLPSrr 6794
+VXORPDYrm 6795
+VXORPDYrr 6796
+VXORPDZ 6797
+VXORPDZrm 6798
+VXORPDZrmb 6799
+VXORPDZrmbk 6800
+VXORPDZrmbkz 6801
+VXORPDZrmk 6802
+VXORPDZrmkz 6803
+VXORPDZrr 6804
+VXORPDZrrk 6805
+VXORPDZrrkz 6806
+VXORPDrm 6807
+VXORPDrr 6808
+VXORPSYrm 6809
+VXORPSYrr 6810
+VXORPSZ 6811
+VXORPSZrm 6812
+VXORPSZrmb 6813
+VXORPSZrmbk 6814
+VXORPSZrmbkz 6815
+VXORPSZrmk 6816
+VXORPSZrmkz 6817
+VXORPSZrr 6818
+VXORPSZrrk 6819
+VXORPSZrrkz 6820
+VXORPSrm 6821
+VXORPSrr 6822
+VZEROALL 6823
+VZEROUPPER 6824
+V_SET 6825
+V_SETALLONES 6826
+WAIT 6827
+WBINVD 6828
+WBNOINVD 6829
+WRFLAGS 6830
+WRFSBASE 6831
+WRGSBASE 6832
+WRMSR 6833
+WRMSRLIST 6834
+WRMSRNS 6835
+WRMSRNSir 6836
+WRMSRNSir_EVEX 6837
+WRPKRUr 6838
+WRSSD 6839
+WRSSD_EVEX 6840
+WRSSQ 6841
+WRSSQ_EVEX 6842
+WRUSSD 6843
+WRUSSD_EVEX 6844
+WRUSSQ 6845
+WRUSSQ_EVEX 6846
+XABORT 6847
+XABORT_DEF 6848
+XACQUIRE_PREFIX 6849
+XADD 6850
+XAM_F 6851
+XAM_Fp 6852
+XBEGIN 6853
+XCHG 6854
+XCH_F 6855
+XCRYPTCBC 6856
+XCRYPTCFB 6857
+XCRYPTCTR 6858
+XCRYPTECB 6859
+XCRYPTOFB 6860
+XEND 6861
+XGETBV 6862
+XLAT 6863
+XOR 6864
+XORPDrm 6865
+XORPDrr 6866
+XORPSrm 6867
+XORPSrr 6868
+XRELEASE_PREFIX 6869
+XRESLDTRK 6870
+XRSTOR 6871
+XRSTORS 6872
+XSAVE 6873
+XSAVEC 6874
+XSAVEOPT 6875
+XSAVES 6876
+XSETBV 6877
+XSHA 6878
+XSTORE 6879
+XSUSLDTRK 6880
+XTEST 6881
+Immediate 6882
+CImmediate 6883
+FPImmediate 6884
+MBB 6885
+FrameIndex 6886
+ConstantPoolIndex 6887
+TargetIndex 6888
+JumpTableIndex 6889
+ExternalSymbol 6890
+GlobalAddress 6891
+BlockAddress 6892
+RegisterMask 6893
+RegisterLiveOut 6894
+Metadata 6895
+MCSymbol 6896
+CFIIndex 6897
+IntrinsicID 6898
+Predicate 6899
+ShuffleMask 6900
+PhyReg_GR8 6901
+PhyReg_GRH8 6902
+PhyReg_GR8_NOREX2 6903
+PhyReg_GR8_NOREX 6904
+PhyReg_GR8_ABCD_H 6905
+PhyReg_GR8_ABCD_L 6906
+PhyReg_GRH16 6907
+PhyReg_GR16 6908
+PhyReg_GR16_NOREX2 6909
+PhyReg_GR16_NOREX 6910
+PhyReg_VK1 6911
+PhyReg_VK16 6912
+PhyReg_VK2 6913
+PhyReg_VK4 6914
+PhyReg_VK8 6915
+PhyReg_VK16WM 6916
+PhyReg_VK1WM 6917
+PhyReg_VK2WM 6918
+PhyReg_VK4WM 6919
+PhyReg_VK8WM 6920
+PhyReg_SEGMENT_REG 6921
+PhyReg_GR16_ABCD 6922
+PhyReg_FPCCR 6923
+PhyReg_FR16X 6924
+PhyReg_FR16 6925
+PhyReg_VK16PAIR 6926
+PhyReg_VK1PAIR 6927
+PhyReg_VK2PAIR 6928
+PhyReg_VK4PAIR 6929
+PhyReg_VK8PAIR 6930
+PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM 6931
+PhyReg_LOW32_ADDR_ACCESS_RBP 6932
+PhyReg_LOW32_ADDR_ACCESS 6933
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 6934
+PhyReg_FR32X 6935
+PhyReg_GR32 6936
+PhyReg_GR32_NOSP 6937
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 6938
+PhyReg_DEBUG_REG 6939
+PhyReg_FR32 6940
+PhyReg_GR32_NOREX2 6941
+PhyReg_GR32_NOREX2_NOSP 6942
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 6943
+PhyReg_GR32_NOREX 6944
+PhyReg_VK32 6945
+PhyReg_GR32_NOREX_NOSP 6946
+PhyReg_RFP32 6947
+PhyReg_VK32WM 6948
+PhyReg_GR32_ABCD 6949
+PhyReg_GR32_TC 6950
+PhyReg_GR32_ABCD_and_GR32_TC 6951
+PhyReg_GR32_AD 6952
+PhyReg_GR32_ArgRef 6953
+PhyReg_GR32_BPSP 6954
+PhyReg_GR32_BSI 6955
+PhyReg_GR32_CB 6956
+PhyReg_GR32_DC 6957
+PhyReg_GR32_DIBP 6958
+PhyReg_GR32_SIDI 6959
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 6960
+PhyReg_CCR 6961
+PhyReg_DFCCR 6962
+PhyReg_GR32_ABCD_and_GR32_BSI 6963
+PhyReg_GR32_AD_and_GR32_ArgRef 6964
+PhyReg_GR32_ArgRef_and_GR32_CB 6965
+PhyReg_GR32_BPSP_and_GR32_DIBP 6966
+PhyReg_GR32_BPSP_and_GR32_TC 6967
+PhyReg_GR32_BSI_and_GR32_SIDI 6968
+PhyReg_GR32_DIBP_and_GR32_SIDI 6969
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 6970
+PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit 6971
+PhyReg_RFP64 6972
+PhyReg_GR64 6973
+PhyReg_FR64X 6974
+PhyReg_GR64_with_sub_8bit 6975
+PhyReg_GR64_NOSP 6976
+PhyReg_GR64_NOREX2 6977
+PhyReg_CONTROL_REG 6978
+PhyReg_FR64 6979
+PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2 6980
+PhyReg_GR64_NOREX2_NOSP 6981
+PhyReg_GR64PLTSafe 6982
+PhyReg_GR64_TC 6983
+PhyReg_GR64_NOREX 6984
+PhyReg_GR64_TCW64 6985
+PhyReg_GR64_TC_with_sub_8bit 6986
+PhyReg_GR64_NOREX2_NOSP_and_GR64_TC 6987
+PhyReg_GR64_TCW64_with_sub_8bit 6988
+PhyReg_GR64_TC_and_GR64_TCW64 6989
+PhyReg_GR64_with_sub_16bit_in_GR16_NOREX 6990
+PhyReg_VK64 6991
+PhyReg_VR64 6992
+PhyReg_GR64PLTSafe_and_GR64_TC 6993
+PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64 6994
+PhyReg_GR64_NOREX_NOSP 6995
+PhyReg_GR64_NOREX_and_GR64_TC 6996
+PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 6997
+PhyReg_VK64WM 6998
+PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 6999
+PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7000
+PhyReg_GR64PLTSafe_and_GR64_TCW64 7001
+PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7002
+PhyReg_GR64_NOREX_and_GR64_TCW64 7003
+PhyReg_GR64_ABCD 7004
+PhyReg_GR64_with_sub_32bit_in_GR32_TC 7005
+PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7006
+PhyReg_GR64_AD 7007
+PhyReg_GR64_ArgRef 7008
+PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7009
+PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef 7010
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP 7011
+PhyReg_GR64_with_sub_32bit_in_GR32_BSI 7012
+PhyReg_GR64_with_sub_32bit_in_GR32_CB 7013
+PhyReg_GR64_with_sub_32bit_in_GR32_DIBP 7014
+PhyReg_GR64_with_sub_32bit_in_GR32_SIDI 7015
+PhyReg_GR64_A 7016
+PhyReg_GR64_ArgRef_and_GR64_TC 7017
+PhyReg_GR64_and_LOW32_ADDR_ACCESS 7018
+PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7019
+PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7020
+PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7021
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7022
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7023
+PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7024
+PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7025
+PhyReg_RST 7026
+PhyReg_RFP80 7027
+PhyReg_RFP80_7 7028
+PhyReg_VR128X 7029
+PhyReg_VR128 7030
+PhyReg_VR256X 7031
+PhyReg_VR256 7032
+PhyReg_VR512 7033
+PhyReg_VR512_0_15 7034
+PhyReg_TILE 7035
+PhyReg_TILEPAIR 7036
+VirtReg_GR8 7037
+VirtReg_GRH8 7038
+VirtReg_GR8_NOREX2 7039
+VirtReg_GR8_NOREX 7040
+VirtReg_GR8_ABCD_H 7041
+VirtReg_GR8_ABCD_L 7042
+VirtReg_GRH16 7043
+VirtReg_GR16 7044
+VirtReg_GR16_NOREX2 7045
+VirtReg_GR16_NOREX 7046
+VirtReg_VK1 7047
+VirtReg_VK16 7048
+VirtReg_VK2 7049
+VirtReg_VK4 7050
+VirtReg_VK8 7051
+VirtReg_VK16WM 7052
+VirtReg_VK1WM 7053
+VirtReg_VK2WM 7054
+VirtReg_VK4WM 7055
+VirtReg_VK8WM 7056
+VirtReg_SEGMENT_REG 7057
+VirtReg_GR16_ABCD 7058
+VirtReg_FPCCR 7059
+VirtReg_FR16X 7060
+VirtReg_FR16 7061
+VirtReg_VK16PAIR 7062
+VirtReg_VK1PAIR 7063
+VirtReg_VK2PAIR 7064
+VirtReg_VK4PAIR 7065
+VirtReg_VK8PAIR 7066
+VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM 7067
+VirtReg_LOW32_ADDR_ACCESS_RBP 7068
+VirtReg_LOW32_ADDR_ACCESS 7069
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 7070
+VirtReg_FR32X 7071
+VirtReg_GR32 7072
+VirtReg_GR32_NOSP 7073
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 7074
+VirtReg_DEBUG_REG 7075
+VirtReg_FR32 7076
+VirtReg_GR32_NOREX2 7077
+VirtReg_GR32_NOREX2_NOSP 7078
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 7079
+VirtReg_GR32_NOREX 7080
+VirtReg_VK32 7081
+VirtReg_GR32_NOREX_NOSP 7082
+VirtReg_RFP32 7083
+VirtReg_VK32WM 7084
+VirtReg_GR32_ABCD 7085
+VirtReg_GR32_TC 7086
+VirtReg_GR32_ABCD_and_GR32_TC 7087
+VirtReg_GR32_AD 7088
+VirtReg_GR32_ArgRef 7089
+VirtReg_GR32_BPSP 7090
+VirtReg_GR32_BSI 7091
+VirtReg_GR32_CB 7092
+VirtReg_GR32_DC 7093
+VirtReg_GR32_DIBP 7094
+VirtReg_GR32_SIDI 7095
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 7096
+VirtReg_CCR 7097
+VirtReg_DFCCR 7098
+VirtReg_GR32_ABCD_and_GR32_BSI 7099
+VirtReg_GR32_AD_and_GR32_ArgRef 7100
+VirtReg_GR32_ArgRef_and_GR32_CB 7101
+VirtReg_GR32_BPSP_and_GR32_DIBP 7102
+VirtReg_GR32_BPSP_and_GR32_TC 7103
+VirtReg_GR32_BSI_and_GR32_SIDI 7104
+VirtReg_GR32_DIBP_and_GR32_SIDI 7105
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 7106
+VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit 7107
+VirtReg_RFP64 7108
+VirtReg_GR64 7109
+VirtReg_FR64X 7110
+VirtReg_GR64_with_sub_8bit 7111
+VirtReg_GR64_NOSP 7112
+VirtReg_GR64_NOREX2 7113
+VirtReg_CONTROL_REG 7114
+VirtReg_FR64 7115
+VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2 7116
+VirtReg_GR64_NOREX2_NOSP 7117
+VirtReg_GR64PLTSafe 7118
+VirtReg_GR64_TC 7119
+VirtReg_GR64_NOREX 7120
+VirtReg_GR64_TCW64 7121
+VirtReg_GR64_TC_with_sub_8bit 7122
+VirtReg_GR64_NOREX2_NOSP_and_GR64_TC 7123
+VirtReg_GR64_TCW64_with_sub_8bit 7124
+VirtReg_GR64_TC_and_GR64_TCW64 7125
+VirtReg_GR64_with_sub_16bit_in_GR16_NOREX 7126
+VirtReg_VK64 7127
+VirtReg_VR64 7128
+VirtReg_GR64PLTSafe_and_GR64_TC 7129
+VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64 7130
+VirtReg_GR64_NOREX_NOSP 7131
+VirtReg_GR64_NOREX_and_GR64_TC 7132
+VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 7133
+VirtReg_VK64WM 7134
+VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 7135
+VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7136
+VirtReg_GR64PLTSafe_and_GR64_TCW64 7137
+VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7138
+VirtReg_GR64_NOREX_and_GR64_TCW64 7139
+VirtReg_GR64_ABCD 7140
+VirtReg_GR64_with_sub_32bit_in_GR32_TC 7141
+VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7142
+VirtReg_GR64_AD 7143
+VirtReg_GR64_ArgRef 7144
+VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7145
+VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef 7146
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP 7147
+VirtReg_GR64_with_sub_32bit_in_GR32_BSI 7148
+VirtReg_GR64_with_sub_32bit_in_GR32_CB 7149
+VirtReg_GR64_with_sub_32bit_in_GR32_DIBP 7150
+VirtReg_GR64_with_sub_32bit_in_GR32_SIDI 7151
+VirtReg_GR64_A 7152
+VirtReg_GR64_ArgRef_and_GR64_TC 7153
+VirtReg_GR64_and_LOW32_ADDR_ACCESS 7154
+VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7155
+VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7156
+VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7157
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7158
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7159
+VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7160
+VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7161
+VirtReg_RST 7162
+VirtReg_RFP80 7163
+VirtReg_RFP80_7 7164
+VirtReg_VR128X 7165
+VirtReg_VR128 7166
+VirtReg_VR256X 7167
+VirtReg_VR256 7168
+VirtReg_VR512 7169
+VirtReg_VR512_0_15 7170
+VirtReg_TILE 7171
+VirtReg_TILEPAIR 7172
diff --git a/llvm/test/tools/llvm-ir2vec/triplets.mir b/llvm/test/tools/llvm-ir2vec/triplets.mir
new file mode 100644
index 0000000..274984a
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/triplets.mir
@@ -0,0 +1,61 @@
+# REQUIRES: x86_64-linux
+# RUN: llvm-ir2vec triplets --mode=mir %s -o %t1.log 2>&1
+# RUN: diff %S/output/reference_triplets.txt %t1.log
+
+--- |
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-linux-gnu"
+
+ define dso_local noundef i32 @add_function(i32 noundef %a, i32 noundef %b) {
+ entry:
+ %sum = add nsw i32 %a, %b
+ ret i32 %sum
+ }
+
+ define dso_local noundef i32 @mul_function(i32 noundef %x, i32 noundef %y) {
+ entry:
+ %product = mul nsw i32 %x, %y
+ ret i32 %product
+ }
+...
+---
+name: add_function
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr32 }
+ - { id: 2, class: gr32 }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+ - { reg: '$esi', virtual-reg: '%1' }
+body: |
+ bb.0.entry:
+ liveins: $edi, $esi
+
+ %1:gr32 = COPY $esi
+ %0:gr32 = COPY $edi
+ %2:gr32 = nsw ADD32rr %0, %1, implicit-def dead $eflags
+ $eax = COPY %2
+ RET 0, $eax
+
+---
+name: mul_function
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr32 }
+ - { id: 2, class: gr32 }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+ - { reg: '$esi', virtual-reg: '%1' }
+body: |
+ bb.0.entry:
+ liveins: $edi, $esi
+
+ %1:gr32 = COPY $esi
+ %0:gr32 = COPY $edi
+ %2:gr32 = nsw IMUL32rr %0, %1, implicit-def dead $eflags
+ $eax = COPY %2
+ RET 0, $eax
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
index 49af4df..c20409e 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
@@ -6864,7 +6864,7 @@ zip2 z31.s, z31.s, z31.s
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14]
-# CHECK-NEXT: - - - - - - - 245.00 651.00 651.00 570.50 272.50 83.75 83.75 81.75 81.75 1536.75 1281.75 794.25 748.25
+# CHECK-NEXT: - - - - - - - 245.00 651.00 651.00 570.50 272.50 83.75 83.75 81.75 81.75 1540.75 1285.75 790.25 744.25
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] Instructions:
@@ -9617,39 +9617,39 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - - - - - 9.00 9.00 - - st4w { z21.s - z24.s }, p5, [x10, #20, mul vl]
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - - - - - 9.00 9.00 - - st4w { z23.s - z26.s }, p3, [x13, #-32, mul vl]
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - 4.50 4.50 4.50 4.50 9.00 9.00 - - st4w { z5.s - z8.s }, p3, [x17, x16, lsl #2]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z0.b }, p0, [x0, x0]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z0.b }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z0.b }, p0, [x0, x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z0.b }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z0.d }, p0, [z1.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z21.b }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z23.b }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z21.b }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z23.b }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z31.s }, p7, [z31.s, x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z31.s }, p7, [z31.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z0.d }, p0, [x0, x0, lsl #3]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z0.d }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z0.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z0.d }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z21.d }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z23.d }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z21.d }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z23.d }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 stnt1h { z0.h }, p0, [x0, x0, lsl #1]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z0.h }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - 0.25 0.25 0.25 0.25 0.50 0.50 - - stnt1h { z0.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z0.h }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z21.h }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z23.h }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z21.h }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z23.h }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z31.s }, p7, [z31.s, x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z31.s }, p7, [z31.s]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z0.s }, p0, [x0, x0, lsl #2]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z0.s }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z0.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z0.s }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1w { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z21.s }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z23.s }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z21.s }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z23.s }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1w { z31.s }, p7, [z31.s, x0]
diff --git a/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test b/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test
index aaaf6bf..9899dc5 100644
--- a/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test
+++ b/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test
@@ -13,4 +13,35 @@
# RUN: dsymutil -f -oso-prepend-path=%p/../../dsymutil/ %t3 -o %t3.dSYM
# RUN: llvm-objdump --source --prefix=%p/../../dsymutil %t3 | FileCheck --check-prefix=SOURCE %s
+## Test that --source works with --macho flag.
+
+## --macho w/ explicit .dSYM
+# RUN: llvm-objdump < %p/../../dsymutil/Inputs/basic.macho.x86_64 - --source --macho --dsym=%t1.dSYM --prefix=%p/../../dsymutil | \
+# RUN: FileCheck --check-prefix=SOURCE %s
+
+## --macho w/ auto-detected .dSYM (dir)
+# RUN: llvm-objdump --source --macho --prefix=%p/../../dsymutil %t2 | FileCheck --check-prefix=SOURCE %s
+
+## --macho w/ auto-detected .dSYM (file)
+# RUN: llvm-objdump --source --macho --prefix=%p/../../dsymutil %t3 | FileCheck --check-prefix=SOURCE %s
+
# SOURCE: ; int bar(int arg) {
+
+## Test that --line-numbers works with --macho flag.
+
+## --macho -l w/ explicit .dSYM
+# RUN: llvm-objdump -d -l --macho --dsym=%t1.dSYM %p/../../dsymutil/Inputs/basic.macho.x86_64 | FileCheck --check-prefix=LINE %s
+
+## --macho -l w/ object file (embedded debug info)
+# RUN: llvm-objdump -d -l --macho %p/../../dsymutil/Inputs/basic1.macho.x86_64.o | FileCheck --check-prefix=LINE_OBJ %s
+
+# LINE: (__TEXT,__text) section
+# LINE: _bar:
+# LINE: ; bar():
+# LINE: ; {{.*}}basic3.c:
+
+# LINE_OBJ: (__TEXT,__text) section
+# LINE_OBJ: _main:
+# LINE_OBJ: ; main():
+# LINE_OBJ: ; {{.*}}basic1.c:23
+# LINE_OBJ: pushq %rbp ## basic1.c:23:0
diff --git a/llvm/test/tools/llvm-profdata/input-wildcard.test b/llvm/test/tools/llvm-profdata/input-wildcard.test
new file mode 100644
index 0000000..f2c46c9
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/input-wildcard.test
@@ -0,0 +1,25 @@
+# This test verifies that llvm-profdata will do wildcard expansion on its
+# arguments. The expansion is done by Windows-specific support in InitLLVM, so
+# we only expect this to work on Windows hosts.
+# REQUIRES: system-windows
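+# (Reference sketch, not part of the test: the expansion comes from
+# llvm::InitLLVM, which LLVM tools construct at the top of main; on
+# Windows it rewrites argv, expanding wildcard arguments before option
+# parsing runs.)
+#
+#   #include "llvm/Support/InitLLVM.h"
+#   int main(int argc, char **argv) {
+#     llvm::InitLLVM X(argc, argv); // expands '*' args on Windows hosts
+#     // ... option parsing then sees the expanded file list ...
+#   }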
+
+# Create two files to glob.
+RUN: echo '# empty profile 1' > %t.prof1.proftxt
+RUN: echo '# empty profile 2' > %t.prof2.proftxt
+
+# Prevent LIT itself from globbing by quoting the wildcard argument.
+RUN: llvm-profdata merge "%t.*.proftxt" -dump-input-file-list -o /dev/null | FileCheck %s
+
+# Verify that llvm-profdata expanded the wildcard argument.
+CHECK: 1,{{.*}}.prof1.proftxt
+CHECK-NEXT: 1,{{.*}}.prof2.proftxt
diff --git a/llvm/test/tools/llvm-readobj/ELF/section-types.test b/llvm/test/tools/llvm-readobj/ELF/section-types.test
index 904892a..12a9d05 100644
--- a/llvm/test/tools/llvm-readobj/ELF/section-types.test
+++ b/llvm/test/tools/llvm-readobj/ELF/section-types.test
@@ -63,6 +63,8 @@
# LLVM: Type: SHT_LLVM_PART_PHDR
# LLVM: Name: .llvm.lto
# LLVM: Type: SHT_LLVM_LTO
+# LLVM: Name: .llvm.callgraph
+# LLVM: Type: SHT_LLVM_CALL_GRAPH
# LLVM: Name: gnu_sframe
# LLVM: Type: SHT_GNU_SFRAME
# LLVM: Name: gnu_attributes
@@ -127,6 +129,7 @@
# GNU-NEXT: part1 LLVM_PART_EHDR
# GNU-NEXT: .phdrs LLVM_PART_PHDR
# GNU-NEXT: .llvm.lto LLVM_LTO
+# GNU-NEXT: .llvm.callgraph LLVM_CALL_GRAPH
# GNU-NEXT: gnu_sframe SFRAME
# GNU-NEXT: gnu_attributes ATTRIBUTES
# GNU-NEXT: gnu_hash GNU_HASH
@@ -218,6 +221,8 @@ Sections:
Type: SHT_LLVM_PART_PHDR
- Name: .llvm.lto
Type: SHT_LLVM_LTO
+ - Name: .llvm.callgraph
+ Type: SHT_LLVM_CALL_GRAPH
- Name: gnu_sframe
Type: SHT_GNU_SFRAME
- Name: gnu_attributes
diff --git a/llvm/test/tools/opt/no-target-machine.ll b/llvm/test/tools/opt/no-target-machine.ll
new file mode 100644
index 0000000..4f07c81
--- /dev/null
+++ b/llvm/test/tools/opt/no-target-machine.ll
@@ -0,0 +1,17 @@
+; Report error when pass requires TargetMachine.
+; RUN: not opt -passes=atomic-expand -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=codegenprepare -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=complex-deinterleaving -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=dwarf-eh-prepare -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=expand-large-div-rem -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=expand-memcmp -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=indirectbr-expand -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=interleaved-access -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=interleaved-load-combine -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=safe-stack -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=select-optimize -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=stack-protector -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=typepromotion -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes='expand-fp<O1>' -disable-output %s 2>&1 | FileCheck %s
+define void @foo() { ret void }
+; CHECK: pass '{{.+}}' requires TargetMachine