author     Noah Goldstein <goldstein.w.n@gmail.com>  2024-03-13 13:13:52 -0700
committer  Fangrui Song <i@maskray.me>  2024-03-13 13:13:52 -0700
commit     9ce8691dea8dadc1302abacf4302f3b805e1448d (patch)
tree       fdc2da3081156b4c9b80b0d417f090efadac946c /llvm
parent     795e3c3d94da0a664642d4580d87c82c02d5eca4 (diff)
parent     744a23f24b08e8b988b176173c433d64761e66b3 (diff)
[spr] changes introduced through rebase (users/MaskRay/spr/main.llvm-objcopy-add-compress-sections)
Created using spr 1.3.5-bogner [skip ci]
Diffstat (limited to 'llvm')
-rw-r--r-- llvm/CMakeLists.txt | 12
-rw-r--r-- llvm/cmake/modules/AddLLVM.cmake | 15
-rw-r--r-- llvm/docs/AMDGPUUsage.rst | 31
-rw-r--r-- llvm/docs/CommandGuide/llvm-ar.rst | 2
-rw-r--r-- llvm/docs/ReleaseNotes.rst | 3
-rw-r--r-- llvm/include/llvm/ADT/STLExtras.h | 13
-rw-r--r-- llvm/include/llvm/CodeGen/AccelTable.h | 2
-rw-r--r-- llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h | 2
-rw-r--r-- llvm/include/llvm/CodeGen/SDPatternMatch.h | 16
-rw-r--r-- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 4
-rw-r--r-- llvm/include/llvm/IR/BasicBlock.h | 32
-rw-r--r-- llvm/include/llvm/IR/DebugProgramInstruction.h | 62
-rw-r--r-- llvm/include/llvm/IR/Instruction.h | 31
-rw-r--r-- llvm/include/llvm/IR/PassManager.h | 2
-rw-r--r-- llvm/include/llvm/MC/MCInstBuilder.h | 6
-rw-r--r-- llvm/include/llvm/Object/Archive.h | 1
-rw-r--r-- llvm/include/llvm/Transforms/Scalar/Float2Int.h | 2
-rw-r--r-- llvm/lib/Analysis/ValueTracking.cpp | 43
-rw-r--r-- llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp | 6
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp | 35
-rw-r--r-- llvm/lib/CodeGen/CodeGenPrepare.cpp | 8
-rw-r--r-- llvm/lib/CodeGen/MIRPrinter.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/SelectOptimize.cpp | 15
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 84
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 26
-rw-r--r-- llvm/lib/CodeGen/SjLjEHPrepare.cpp | 4
-rw-r--r-- llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp | 2
-rw-r--r-- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 30
-rw-r--r-- llvm/lib/IR/AsmWriter.cpp | 2
-rw-r--r-- llvm/lib/IR/BasicBlock.cpp | 176
-rw-r--r-- llvm/lib/IR/DebugInfo.cpp | 2
-rw-r--r-- llvm/lib/IR/DebugProgramInstruction.cpp | 70
-rw-r--r-- llvm/lib/IR/Instruction.cpp | 37
-rw-r--r-- llvm/lib/IR/LLVMContextImpl.cpp | 2
-rw-r--r-- llvm/lib/IR/LLVMContextImpl.h | 12
-rw-r--r-- llvm/lib/MC/MCSectionXCOFF.cpp | 3
-rw-r--r-- llvm/lib/Object/Archive.cpp | 15
-rw-r--r-- llvm/lib/Object/ArchiveWriter.cpp | 28
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 51
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 53
-rw-r--r-- llvm/lib/Target/AArch64/SMEInstrFormats.td | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 24
-rw-r--r-- llvm/lib/Target/AMDGPU/BUFInstructions.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/DSInstructions.td | 20
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 65
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.h | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstructions.td | 51
-rw-r--r-- llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp | 4
-rw-r--r-- llvm/lib/Target/PowerPC/P10InstrResources.td | 2
-rw-r--r-- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 8
-rw-r--r-- llvm/lib/Target/PowerPC/PPCBack2BackFusion.def | 4
-rw-r--r-- llvm/lib/Target/PowerPC/PPCFastISel.cpp | 10
-rw-r--r-- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 44
-rw-r--r-- llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 4
-rw-r--r-- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 12
-rw-r--r-- llvm/lib/Target/PowerPC/PPCMacroFusion.def | 6
-rw-r--r-- llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 22
-rw-r--r-- llvm/lib/Target/PowerPC/PPCSubtarget.h | 2
-rw-r--r-- llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp | 3
-rw-r--r-- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 50
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 16
-rw-r--r-- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 10
-rw-r--r-- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 2
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 49
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 136
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 32
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 3
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 41
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 4
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVUtils.h | 26
-rw-r--r-- llvm/lib/TargetParser/Host.cpp | 15
-rw-r--r-- llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 2
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 5
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 4
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 47
-rw-r--r-- llvm/lib/Transforms/Scalar/Float2Int.cpp | 29
-rw-r--r-- llvm/lib/Transforms/Utils/Local.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Utils/LoopConstrainer.cpp | 22
-rw-r--r-- llvm/lib/Transforms/Utils/LoopRotationUtils.cpp | 28
-rw-r--r-- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 9
-rw-r--r-- llvm/lib/Transforms/Utils/ValueMapper.cpp | 8
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 16
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 8
-rw-r--r-- llvm/test/Analysis/Lint/crash_empty_iterator.ll | 22
-rw-r--r-- llvm/test/Analysis/ValueTracking/knownbits-select-from-cond.ll | 81
-rw-r--r-- llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll | 21
-rw-r--r-- llvm/test/CodeGen/AArch64/bitcast.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/extbinopload.ll | 31
-rw-r--r-- llvm/test/CodeGen/AArch64/fold-global-offsets.ll | 2
-rw-r--r-- llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll | 5
-rw-r--r-- llvm/test/CodeGen/AArch64/fptoi.ll | 256
-rw-r--r-- llvm/test/CodeGen/AArch64/neon-truncstore.ll | 5
-rw-r--r-- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 2
-rw-r--r-- llvm/test/CodeGen/AArch64/shuffle-tbl34.ll | 14
-rw-r--r-- llvm/test/CodeGen/AArch64/shufflevector.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/sme-write-vg.ll | 24
-rw-r--r-- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 2
-rw-r--r-- llvm/test/CodeGen/AArch64/tbl-loops.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/trunc-to-tbl.ll | 28
-rw-r--r-- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 2
-rw-r--r-- llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 2
-rw-r--r-- llvm/test/CodeGen/AArch64/vcvt-oversize.ll | 5
-rw-r--r-- llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll | 2
-rw-r--r-- llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll | 22
-rw-r--r-- llvm/test/CodeGen/AArch64/xor.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bf16.ll | 903
-rw-r--r-- llvm/test/CodeGen/AMDGPU/clamp.ll | 64
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll | 103
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 728
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 22
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 22
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 122
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 122
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 191
-rw-r--r-- llvm/test/CodeGen/Hexagon/addrmode-immop.mir | 4
-rw-r--r-- llvm/test/CodeGen/NVPTX/b52037.ll | 2
-rw-r--r-- llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir | 2
-rw-r--r-- llvm/test/CodeGen/PowerPC/toc-data-large-array.ll | 16
-rw-r--r-- llvm/test/CodeGen/PowerPC/toc-data-large-array2.ll | 8
-rw-r--r-- llvm/test/CodeGen/PowerPC/toc-data-struct-array.ll | 110
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/compressstore.ll | 871
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll | 1004
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll | 928
-rw-r--r-- llvm/test/CodeGen/SPIRV/ComparePointers.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/capability-kernel.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll | 2
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll | 28
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll | 28
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll | 29
-rw-r--r-- llvm/test/CodeGen/SPIRV/relationals.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/simple.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/fadd.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/fmod.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/fmul.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/fneg.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/frem.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/fsub.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/global_block.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/isequal.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll | 1
-rw-r--r-- llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll | 12
-rw-r--r-- llvm/test/CodeGen/X86/tls-align.ll | 2
-rw-r--r-- llvm/test/DebugInfo/X86/tu-to-non-tu.ll | 8
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 44
-rw-r--r-- llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_bar.ll | 2
-rw-r--r-- llvm/test/ThinLTO/X86/devirt_after_filtering_unreachable.ll | 2
-rw-r--r-- llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll | 2
-rw-r--r-- llvm/test/ThinLTO/X86/devirt_local_same_guid.ll | 2
-rw-r--r-- llvm/test/ThinLTO/X86/lower_type_test_phi.ll | 4
-rw-r--r-- llvm/test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll | 2
-rw-r--r-- llvm/test/ThinLTO/X86/type_test_noindircall.ll | 4
-rw-r--r-- llvm/test/Transforms/Float2Int/basic.ll | 251
-rw-r--r-- llvm/test/Transforms/IRCE/compound-loop-bound.ll | 85
-rw-r--r-- llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll | 93
-rw-r--r-- llvm/test/Transforms/InstCombine/zext.ll | 31
-rw-r--r-- llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll | 40
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll | 88
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll | 89
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll | 83
-rw-r--r-- llvm/test/tools/llvm-ar/coff-symtab.test | 91
-rw-r--r-- llvm/test/tools/llvm-ar/no-symtab.yaml | 32
-rw-r--r-- llvm/test/tools/llvm-mca/RISCV/SiFive7/vector-integer-arithmetic.s | 70
-rw-r--r-- llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zstd.test | 2
-rw-r--r-- llvm/tools/llvm-ar/llvm-ar.cpp | 22
-rw-r--r-- llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp | 9
-rw-r--r-- llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp | 24
-rw-r--r-- llvm/tools/llvm-exegesis/lib/SubprocessMemory.h | 2
-rw-r--r-- llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp | 11
-rw-r--r-- llvm/unittests/IR/BasicBlockDbgInfoTest.cpp | 76
-rw-r--r-- llvm/unittests/IR/DebugInfoTest.cpp | 26
-rw-r--r-- llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp | 12
-rw-r--r-- llvm/unittests/Transforms/Utils/DebugifyTest.cpp | 2
-rw-r--r-- llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp | 5
-rw-r--r-- llvm/utils/TableGen/DecoderEmitter.cpp | 4
225 files changed, 4671 insertions, 4277 deletions
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index bd141619..6f5647d 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -159,6 +159,18 @@ foreach(proj IN LISTS LLVM_ENABLE_RUNTIMES)
endif()
endforeach()
+# Set a shorthand option to enable the GPU build of the 'libc' project.
+option(LIBC_GPU_BUILD "Enable the 'libc' project targeting the GPU" OFF)
+if(LIBC_GPU_BUILD)
+ if(LLVM_RUNTIME_TARGETS)
+ list(APPEND LLVM_RUNTIME_TARGETS "nvptx64-nvidia-cuda" "amdgcn-amd-amdhsa")
+ else()
+ set(LLVM_RUNTIME_TARGETS "default;nvptx64-nvidia-cuda;amdgcn-amd-amdhsa")
+ endif()
+ list(APPEND RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "libc")
+ list(APPEND RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "libc")
+endif()
+
set(NEED_LIBC_HDRGEN FALSE)
if("libc" IN_LIST LLVM_ENABLE_RUNTIMES)
set(NEED_LIBC_HDRGEN TRUE)
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 374f5e0..eb9e610 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -257,6 +257,16 @@ if (NOT DEFINED LLVM_LINKER_DETECTED AND NOT WIN32)
message(STATUS "Linker detection: unknown")
endif()
endif()
+
+ # Apple's linker complains about duplicate libraries, which CMake likes to do
+ # to support ELF platforms. To silence that warning, we can use
+ # -no_warn_duplicate_libraries, but only in versions of the linker that
+ # support that flag.
+ if(NOT LLVM_USE_LINKER AND ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+ check_linker_flag(C "-Wl,-no_warn_duplicate_libraries" LLVM_LINKER_SUPPORTS_NO_WARN_DUPLICATE_LIBRARIES)
+ else()
+ set(LLVM_LINKER_SUPPORTS_NO_WARN_DUPLICATE_LIBRARIES OFF CACHE INTERNAL "")
+ endif()
endif()
function(add_link_opts target_name)
@@ -310,6 +320,11 @@ function(add_link_opts target_name)
endif()
endif()
+ if(LLVM_LINKER_SUPPORTS_NO_WARN_DUPLICATE_LIBRARIES)
+ set_property(TARGET ${target_name} APPEND_STRING PROPERTY
+ LINK_FLAGS " -Wl,-no_warn_duplicate_libraries")
+ endif()
+
if(ARG_SUPPORT_PLUGINS AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
set_property(TARGET ${target_name} APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,-brtl")
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 99d7a48..fe37e85 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1312,23 +1312,30 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
List AMDGPU intrinsics.
+.. _amdgpu_metadata:
+
LLVM IR Metadata
-------------------
+================
-The AMDGPU backend implements the following LLVM IR metadata.
+The AMDGPU backend implements the following target custom LLVM IR
+metadata.
-.. table:: AMDGPU LLVM IR Metadata
- :name: amdgpu-llvm-ir-metadata-table
+.. _amdgpu_last_use:
+
+'``amdgpu.last.use``' Metadata
+------------------------------
+
+Sets TH_LOAD_LU temporal hint on load instructions that support it.
+Takes priority over nontemporal hint (TH_LOAD_NT). This takes no
+arguments.
+
+.. code-block:: llvm
+
+ %val = load i32, ptr %in, align 4, !amdgpu.last.use !{}
- ============================================== ==========================================================
- LLVM IR Metadata Description
- ============================================== ==========================================================
- !amdgpu.last.use Sets TH_LOAD_LU temporal hint on load instructions that support it.
- Takes priority over nontemporal hint (TH_LOAD_NT).
- ============================================== ==========================================================
LLVM IR Attributes
-------------------
+==================
The AMDGPU backend supports the following LLVM IR attributes.
@@ -1450,7 +1457,7 @@ The AMDGPU backend supports the following LLVM IR attributes.
======================================= ==========================================================
Calling Conventions
--------------------
+===================
The AMDGPU backend supports the following calling conventions:
diff --git a/llvm/docs/CommandGuide/llvm-ar.rst b/llvm/docs/CommandGuide/llvm-ar.rst
index 03d5b9e..63b3a51 100644
--- a/llvm/docs/CommandGuide/llvm-ar.rst
+++ b/llvm/docs/CommandGuide/llvm-ar.rst
@@ -261,7 +261,7 @@ Other
.. option:: --format=<type>
- This option allows for default, gnu, darwin or bsd ``<type>`` to be selected.
+ This option allows for default, gnu, darwin, bsd or coff ``<type>`` to be selected.
When creating an ``archive`` with the default ``<type>``, :program:``llvm-ar``
will attempt to infer it from the input files and fallback to the default
toolchain target if unable to do so.
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index b34a5f3..7be5173 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -153,6 +153,9 @@ Changes to the LLVM tools
if it's not specified with the ``--format`` argument and cannot be inferred from
input files.
+* llvm-ar now allows specifying COFF archive format with ``--format`` argument
+ and uses it by default for COFF targets.
+
* llvm-objcopy now supports ``--set-symbol-visibility`` and
``--set-symbols-visibility`` options for ELF input to change the
visibility of symbols.
diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h
index 5ac549c..02a3074 100644
--- a/llvm/include/llvm/ADT/STLExtras.h
+++ b/llvm/include/llvm/ADT/STLExtras.h
@@ -1945,6 +1945,19 @@ auto partition(R &&Range, UnaryPredicate P) {
return std::partition(adl_begin(Range), adl_end(Range), P);
}
+/// Provide wrappers to std::binary_search which take ranges instead of having
+/// to pass begin/end explicitly.
+template <typename R, typename T> auto binary_search(R &&Range, T &&Value) {
+ return std::binary_search(adl_begin(Range), adl_end(Range),
+ std::forward<T>(Value));
+}
+
+template <typename R, typename T, typename Compare>
+auto binary_search(R &&Range, T &&Value, Compare C) {
+ return std::binary_search(adl_begin(Range), adl_end(Range),
+ std::forward<T>(Value), C);
+}
+
/// Provide wrappers to std::lower_bound which take ranges instead of having to
/// pass begin/end explicitly.
template <typename R, typename T> auto lower_bound(R &&Range, T &&Value) {
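A minimal usage sketch for the new range-based wrapper, assuming an already-sorted container (the function and values below are illustrative, not part of this patch):

  #include "llvm/ADT/STLExtras.h"
  #include <vector>

  // Membership test on a sorted vector; forwards to std::binary_search over
  // adl_begin(Sorted)/adl_end(Sorted), exactly as the wrapper above does.
  static bool containsSorted(const std::vector<int> &Sorted, int V) {
    return llvm::binary_search(Sorted, V);
  }

The comparator overload is used the same way, e.g. llvm::binary_search(Sorted, V, std::less<int>()).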
diff --git a/llvm/include/llvm/CodeGen/AccelTable.h b/llvm/include/llvm/CodeGen/AccelTable.h
index 6ee817a..cff8fcb 100644
--- a/llvm/include/llvm/CodeGen/AccelTable.h
+++ b/llvm/include/llvm/CodeGen/AccelTable.h
@@ -353,7 +353,7 @@ public:
dwarf::Index Index;
dwarf::Form Form;
};
- DebugNamesAbbrev(uint32_t DieTag) : DieTag(DieTag) {}
+ DebugNamesAbbrev(uint32_t DieTag) : DieTag(DieTag), Number(0) {}
/// Add attribute encoding to an abbreviation.
void addAttribute(const DebugNamesAbbrev::AttributeEncoding &Attr) {
AttrVect.push_back(Attr);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index bfac54a..29f675b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -205,7 +205,7 @@ private:
bool translate(const Constant &C, Register Reg);
/// Examine any debug-info attached to the instruction (in the form of
- /// DPValues) and translate it.
+ /// DbgRecords) and translate it.
void translateDbgInfo(const Instruction &Inst,
MachineIRBuilder &MIRBuilder);
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 412bf42..a86c740 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -496,6 +496,21 @@ inline BinaryOpc_match<LHS, RHS, true> m_Mul(const LHS &L, const RHS &R) {
}
template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_And(const LHS &L, const RHS &R) {
+ return BinaryOpc_match<LHS, RHS, true>(ISD::AND, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_Or(const LHS &L, const RHS &R) {
+ return BinaryOpc_match<LHS, RHS, true>(ISD::OR, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_Xor(const LHS &L, const RHS &R) {
+ return BinaryOpc_match<LHS, RHS, true>(ISD::XOR, L, R);
+}
+
+template <typename LHS, typename RHS>
inline BinaryOpc_match<LHS, RHS, false> m_UDiv(const LHS &L, const RHS &R) {
return BinaryOpc_match<LHS, RHS, false>(ISD::UDIV, L, R);
}
@@ -648,6 +663,7 @@ inline SpecificInt_match m_SpecificInt(uint64_t V) {
}
inline SpecificInt_match m_Zero() { return m_SpecificInt(0U); }
+inline SpecificInt_match m_One() { return m_SpecificInt(1U); }
inline SpecificInt_match m_AllOnes() { return m_SpecificInt(~0U); }
/// Match true boolean value based on the information provided by
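A sketch of how the new matchers compose with the existing ones; N is assumed to be an SDValue inside a combine (as in DAGCombiner), and the pattern shown is illustrative rather than taken from this patch:

  using namespace llvm::SDPatternMatch;

  SDValue X, Y;
  // m_And is commutative, so this matches (sub (and X, Y), 1) with the AND
  // operands in either order; m_One is also introduced by this patch.
  if (sd_match(N, m_Sub(m_And(m_Value(X), m_Value(Y)), m_One()))) {
    // ... build the replacement node from X and Y ...
  }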
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5bbaa8c..c9ee0c2 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1339,10 +1339,12 @@ public:
/// in reductions.
/// \param ReductionInfos A list of info on each reduction variable.
/// \param IsNoWait A flag set if the reduction is marked as nowait.
+ /// \param IsByRef A flag set if the reduction is performed by reference
+ /// rather than by value.
InsertPointTy createReductions(const LocationDescription &Loc,
InsertPointTy AllocaIP,
ArrayRef<ReductionInfo> ReductionInfos,
- bool IsNoWait = false);
+ bool IsNoWait = false, bool IsByRef = false);
///}
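A hypothetical call site for the extended signature (the builder, location, insertion point, and reduction info list are assumed to be in scope):

  // Lower the reductions, passing the new flag to request by-reference
  // updates of the reduction variables.
  OpenMPIRBuilder::InsertPointTy AfterIP = OMPBuilder.createReductions(
      Loc, AllocaIP, ReductionInfos, /*IsNoWait=*/false, /*IsByRef=*/true);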
diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index 5bac113..71c1a83 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -78,13 +78,13 @@ public:
DPMarker *createMarker(InstListType::iterator It);
/// Convert variable location debugging information stored in dbg.value
- /// intrinsics into DPMarker / DPValue records. Deletes all dbg.values in
+ /// intrinsics into DPMarkers / DbgRecords. Deletes all dbg.values in
/// the process and sets IsNewDbgInfoFormat = true. Only takes effect if
/// the UseNewDbgInfoFormat LLVM command line option is given.
void convertToNewDbgValues();
/// Convert variable location debugging information stored in DPMarkers and
- /// DPValues into the dbg.value intrinsic representation. Sets
+ /// DbgRecords into the dbg.value intrinsic representation. Sets
/// IsNewDbgInfoFormat = false.
void convertFromNewDbgValues();
@@ -93,50 +93,50 @@ public:
/// if necessary.
void setIsNewDbgInfoFormat(bool NewFlag);
- /// Record that the collection of DPValues in \p M "trails" after the last
+ /// Record that the collection of DbgRecords in \p M "trails" after the last
/// instruction of this block. These are equivalent to dbg.value intrinsics
/// that exist at the end of a basic block with no terminator (a transient
/// state that occurs regularly).
void setTrailingDbgRecords(DPMarker *M);
- /// Fetch the collection of DPValues that "trail" after the last instruction
+ /// Fetch the collection of DbgRecords that "trail" after the last instruction
/// of this block, see \ref setTrailingDbgRecords. If there are none, returns
/// nullptr.
DPMarker *getTrailingDbgRecords();
- /// Delete any trailing DPValues at the end of this block, see
+ /// Delete any trailing DbgRecords at the end of this block, see
/// \ref setTrailingDbgRecords.
void deleteTrailingDbgRecords();
void dumpDbgValues() const;
- /// Return the DPMarker for the position given by \p It, so that DPValues can
- /// be inserted there. This will either be nullptr if not present, a DPMarker,
- /// or TrailingDPValues if It is end().
+ /// Return the DPMarker for the position given by \p It, so that DbgRecords
+ /// can be inserted there. This will either be nullptr if not present, a
+ /// DPMarker, or TrailingDbgRecords if It is end().
DPMarker *getMarker(InstListType::iterator It);
/// Return the DPMarker for the position that comes after \p I. \see
/// BasicBlock::getMarker, this can be nullptr, a DPMarker, or
- /// TrailingDPValues if there is no next instruction.
+ /// TrailingDbgRecords if there is no next instruction.
DPMarker *getNextMarker(Instruction *I);
- /// Insert a DPValue into a block at the position given by \p I.
+ /// Insert a DbgRecord into a block at the position given by \p I.
void insertDbgRecordAfter(DbgRecord *DPV, Instruction *I);
- /// Insert a DPValue into a block at the position given by \p Here.
+ /// Insert a DbgRecord into a block at the position given by \p Here.
void insertDbgRecordBefore(DbgRecord *DPV, InstListType::iterator Here);
- /// Eject any debug-info trailing at the end of a block. DPValues can
+ /// Eject any debug-info trailing at the end of a block. DbgRecords can
/// transiently be located "off the end" of a block if the blocks terminator
/// is temporarily removed. Once a terminator is re-inserted this method will
- /// move such DPValues back to the right place (ahead of the terminator).
- void flushTerminatorDbgValues();
+ /// move such DbgRecords back to the right place (ahead of the terminator).
+ void flushTerminatorDbgRecords();
/// In rare circumstances instructions can be speculatively removed from
/// blocks, and then be re-inserted back into that position later. When this
/// happens in RemoveDIs debug-info mode, some special patching-up needs to
/// occur: inserting into the middle of a sequence of dbg.value intrinsics
- /// does not have an equivalent with DPValues.
+ /// does not have an equivalent with DbgRecords.
void reinsertInstInDbgRecords(Instruction *I,
std::optional<DbgRecord::self_iterator> Pos);
@@ -522,7 +522,7 @@ private:
BasicBlock::iterator FromEndIt);
/// Perform any debug-info specific maintenance for the given splice
- /// activity. In the DPValue debug-info representation, debug-info is not
+ /// activity. In the DbgRecord debug-info representation, debug-info is not
/// in instructions, and so it does not automatically move from one block
/// to another.
void spliceDebugInfo(BasicBlock::iterator ToIt, BasicBlock *FromBB,
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index 507b652..1afc925 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -1,4 +1,4 @@
-//===-- llvm/DebugProgramInstruction.h - Stream of debug info -------*- C++ -*-===//
+//===-- llvm/DebugProgramInstruction.h - Stream of debug info ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -15,10 +15,10 @@
// %bar = void call @ext(%foo);
//
// and all information is stored in the Value / Metadata hierarchy defined
-// elsewhere in LLVM. In the "DPValue" design, each instruction /may/ have a
-// connection with a DPMarker, which identifies a position immediately before the
-// instruction, and each DPMarker /may/ then have connections to DPValues which
-// record the variable assignment information. To illustrate:
+// elsewhere in LLVM. In the "DbgRecord" design, each instruction /may/ have a
+// connection with a DPMarker, which identifies a position immediately before
+// the instruction, and each DPMarker /may/ then have connections to DbgRecords
+// which record the variable assignment information. To illustrate:
//
// %foo = add i32 1, %0
// ; foo->DbgMarker == nullptr
@@ -26,7 +26,7 @@
// ;; the instruction for %foo, therefore it has no DbgMarker.
// %bar = void call @ext(%foo)
// ; bar->DbgMarker = {
-// ; StoredDPValues = {
+// ; StoredDbgRecords = {
// ; DPValue(metadata i32 %foo, ...)
// ; }
// ; }
@@ -119,7 +119,7 @@ public:
/// Base class for non-instruction debug metadata records that have positions
/// within IR. Features various methods copied across from the Instruction
/// class to aid ease-of-use. DbgRecords should always be linked into a
-/// DPMarker's StoredDPValues list. The marker connects a DbgRecord back to
+/// DPMarker's StoredDbgRecords list. The marker connects a DbgRecord back to
/// its position in the BasicBlock.
///
/// We need a discriminator for dyn/isa casts. In order to avoid paying for a
@@ -557,8 +557,8 @@ public:
/// intrinsics. There is a one-to-one relationship between each debug
/// intrinsic in a block and each DbgRecord once the representation has been
/// converted, and the ordering is meaningful in the same way.
- simple_ilist<DbgRecord> StoredDPValues;
- bool empty() const { return StoredDPValues.empty(); }
+ simple_ilist<DbgRecord> StoredDbgRecords;
+ bool empty() const { return StoredDbgRecords.empty(); }
const BasicBlock *getParent() const;
BasicBlock *getParent();
@@ -576,54 +576,56 @@ public:
void print(raw_ostream &O, bool IsForDebug = false) const;
void print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const;
- /// Produce a range over all the DPValues in this Marker.
+ /// Produce a range over all the DbgRecords in this Marker.
iterator_range<simple_ilist<DbgRecord>::iterator> getDbgRecordRange();
iterator_range<simple_ilist<DbgRecord>::const_iterator>
getDbgRecordRange() const;
- /// Transfer any DPValues from \p Src into this DPMarker. If \p InsertAtHead
- /// is true, place them before existing DPValues, otherwise afterwards.
+ /// Transfer any DbgRecords from \p Src into this DPMarker. If \p InsertAtHead
+ /// is true, place them before existing DbgRecords, otherwise afterwards.
void absorbDebugValues(DPMarker &Src, bool InsertAtHead);
- /// Transfer the DPValues in \p Range from \p Src into this DPMarker. If
- /// \p InsertAtHead is true, place them before existing DPValues, otherwise
+ /// Transfer the DbgRecords in \p Range from \p Src into this DPMarker. If
+ /// \p InsertAtHead is true, place them before existing DbgRecords, otherwise
// afterwards.
void absorbDebugValues(iterator_range<DbgRecord::self_iterator> Range,
DPMarker &Src, bool InsertAtHead);
- /// Insert a DPValue into this DPMarker, at the end of the list. If
+ /// Insert a DbgRecord into this DPMarker, at the end of the list. If
/// \p InsertAtHead is true, at the start.
void insertDbgRecord(DbgRecord *New, bool InsertAtHead);
- /// Insert a DPValue prior to a DPValue contained within this marker.
+ /// Insert a DbgRecord prior to a DbgRecord contained within this marker.
void insertDbgRecord(DbgRecord *New, DbgRecord *InsertBefore);
- /// Insert a DPValue after a DPValue contained within this marker.
+ /// Insert a DbgRecord after a DbgRecord contained within this marker.
void insertDbgRecordAfter(DbgRecord *New, DbgRecord *InsertAfter);
/// Clone all DPMarkers from \p From into this marker. There are numerous
/// options to customise the source/destination, due to gnarliness, see class
/// comment.
- /// \p FromHere If non-null, copy from FromHere to the end of From's DPValues
- /// \p InsertAtHead Place the cloned DPValues at the start of StoredDPValues
- /// \returns Range over all the newly cloned DPValues
+ /// \p FromHere If non-null, copy from FromHere to the end of From's
+ /// DbgRecords
+ /// \p InsertAtHead Place the cloned DbgRecords at the start of
+ /// StoredDbgRecords
+ /// \returns Range over all the newly cloned DbgRecords
iterator_range<simple_ilist<DbgRecord>::iterator>
cloneDebugInfoFrom(DPMarker *From,
std::optional<simple_ilist<DbgRecord>::iterator> FromHere,
bool InsertAtHead = false);
- /// Erase all DPValues in this DPMarker.
+ /// Erase all DbgRecords in this DPMarker.
void dropDbgRecords();
/// Erase a single DbgRecord from this marker. In an ideal future, we would
/// never erase an assignment in this way, but it's the equivalent of
/// erasing a debug intrinsic from a block.
void dropOneDbgRecord(DbgRecord *DR);
- /// We generally act like all llvm Instructions have a range of DPValues
+ /// We generally act like all llvm Instructions have a range of DbgRecords
/// attached to them, but in reality sometimes we don't allocate the DPMarker
- /// to save time and memory, but still have to return ranges of DPValues. When
- /// we need to describe such an unallocated DPValue range, use this static
- /// markers range instead. This will bite us if someone tries to insert a
- /// DPValue in that range, but they should be using the Official (TM) API for
- /// that.
+ /// to save time and memory, but still have to return ranges of DbgRecords.
+ /// When we need to describe such an unallocated DbgRecord range, use this
+ /// static markers range instead. This will bite us if someone tries to insert
+ /// a DbgRecord in that range, but they should be using the Official (TM) API
+ /// for that.
static DPMarker EmptyDPMarker;
static iterator_range<simple_ilist<DbgRecord>::iterator>
getEmptyDbgRecordRange() {
- return make_range(EmptyDPMarker.StoredDPValues.end(),
- EmptyDPMarker.StoredDPValues.end());
+ return make_range(EmptyDPMarker.StoredDbgRecords.end(),
+ EmptyDPMarker.StoredDbgRecords.end());
}
};
@@ -632,7 +634,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const DPMarker &Marker) {
return OS;
}
-/// Inline helper to return a range of DPValues attached to a marker. It needs
+/// Inline helper to return a range of DbgRecords attached to a marker. It needs
/// to be inlined as it's frequently called, but also come after the declaration
/// of DPMarker. Thus: it's pre-declared by users like Instruction, then an
/// inlineable body defined here.
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 817abd6..d6cf155 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -64,47 +64,48 @@ public:
/// Clone any debug-info attached to \p From onto this instruction. Used to
/// copy debugging information from one block to another, when copying entire
- /// blocks. \see DebugProgramInstruction.h , because the ordering of DPValues
- /// is still important, fine grain control of which instructions are moved and
- /// where they go is necessary.
+ /// blocks. \see DebugProgramInstruction.h , because the ordering of
+ /// DbgRecords is still important, fine-grained control of which instructions
+ /// are moved and where they go is necessary.
/// \p From The instruction to clone debug-info from.
- /// \p from_here Optional iterator to limit DPValues cloned to be a range from
+ /// \p from_here Optional iterator to limit DbgRecords cloned to be a range
+ /// from
/// from_here to end().
- /// \p InsertAtHead Whether the cloned DPValues should be placed at the end
- /// or the beginning of existing DPValues attached to this.
- /// \returns A range over the newly cloned DPValues.
+ /// \p InsertAtHead Whether the cloned DbgRecords should be placed at the end
+ /// or the beginning of existing DbgRecords attached to this.
+ /// \returns A range over the newly cloned DbgRecords.
iterator_range<simple_ilist<DbgRecord>::iterator> cloneDebugInfoFrom(
const Instruction *From,
std::optional<simple_ilist<DbgRecord>::iterator> FromHere = std::nullopt,
bool InsertAtHead = false);
- /// Return a range over the DPValues attached to this instruction.
+ /// Return a range over the DbgRecords attached to this instruction.
iterator_range<simple_ilist<DbgRecord>::iterator> getDbgRecordRange() const {
return llvm::getDbgRecordRange(DbgMarker);
}
- /// Return an iterator to the position of the "Next" DPValue after this
+ /// Return an iterator to the position of the "Next" DbgRecord after this
/// instruction, or std::nullopt. This is the position to pass to
/// BasicBlock::reinsertInstInDbgRecords when re-inserting an instruction.
std::optional<simple_ilist<DbgRecord>::iterator> getDbgReinsertionPosition();
- /// Returns true if any DPValues are attached to this instruction.
+ /// Returns true if any DbgRecords are attached to this instruction.
bool hasDbgRecords() const;
- /// Transfer any DPValues on the position \p It onto this instruction,
- /// by simply adopting the sequence of DPValues (which is efficient) if
+ /// Transfer any DbgRecords on the position \p It onto this instruction,
+ /// by simply adopting the sequence of DbgRecords (which is efficient) if
/// possible, by merging two sequences otherwise.
void adoptDbgRecords(BasicBlock *BB, InstListType::iterator It,
bool InsertAtHead);
- /// Erase any DPValues attached to this instruction.
+ /// Erase any DbgRecords attached to this instruction.
void dropDbgRecords();
- /// Erase a single DPValue \p I that is attached to this instruction.
+ /// Erase a single DbgRecord \p I that is attached to this instruction.
void dropOneDbgRecord(DbgRecord *I);
/// Handle the debug-info implications of this instruction being removed. Any
- /// attached DPValues need to "fall" down onto the next instruction.
+ /// attached DbgRecords need to "fall" down onto the next instruction.
void handleMarkerRemoval();
protected:
diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h
index c03d49c..ec8b809 100644
--- a/llvm/include/llvm/IR/PassManager.h
+++ b/llvm/include/llvm/IR/PassManager.h
@@ -227,7 +227,7 @@ public:
detail::getAnalysisResult<PassInstrumentationAnalysis>(
AM, IR, std::tuple<ExtraArgTs...>(ExtraArgs...));
- // RemoveDIs: if requested, convert debug-info to DPValue representation
+ // RemoveDIs: if requested, convert debug-info to DbgRecord representation
// for duration of these passes.
bool ShouldConvertDbgInfo = shouldConvertDbgInfo(IR);
if (ShouldConvertDbgInfo)
diff --git a/llvm/include/llvm/MC/MCInstBuilder.h b/llvm/include/llvm/MC/MCInstBuilder.h
index 6e5e9dd..d06ed4c 100644
--- a/llvm/include/llvm/MC/MCInstBuilder.h
+++ b/llvm/include/llvm/MC/MCInstBuilder.h
@@ -27,6 +27,12 @@ public:
Inst.setOpcode(Opcode);
}
+ /// Set the location.
+ MCInstBuilder &setLoc(SMLoc SM) {
+ Inst.setLoc(SM);
+ return *this;
+ }
+
/// Add a new register operand.
MCInstBuilder &addReg(unsigned Reg) {
Inst.addOperand(MCOperand::createReg(Reg));
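Because MCInstBuilder converts implicitly to MCInst, the new setLoc chains freely with the operand helpers. A sketch with placeholder opcode and registers; IDLoc is assumed to be the SMLoc of the parsed statement:

  // Tag the emitted instruction with its source location so that later
  // diagnostics can point back at the original assembly.
  Out.emitInstruction(MCInstBuilder(RISCV::ADDI)
                          .addReg(RISCV::X10)
                          .addReg(RISCV::X10)
                          .addImm(0)
                          .setLoc(IDLoc),
                      STI);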
diff --git a/llvm/include/llvm/Object/Archive.h b/llvm/include/llvm/Object/Archive.h
index f716300..a3165c3 100644
--- a/llvm/include/llvm/Object/Archive.h
+++ b/llvm/include/llvm/Object/Archive.h
@@ -339,6 +339,7 @@ public:
Kind kind() const { return (Kind)Format; }
bool isThin() const { return IsThin; }
static object::Archive::Kind getDefaultKind();
+ static object::Archive::Kind getDefaultKindForTriple(Triple &T);
child_iterator child_begin(Error &Err, bool SkipInternal = true) const;
child_iterator child_end() const;
diff --git a/llvm/include/llvm/Transforms/Scalar/Float2Int.h b/llvm/include/llvm/Transforms/Scalar/Float2Int.h
index 83be329..337e229 100644
--- a/llvm/include/llvm/Transforms/Scalar/Float2Int.h
+++ b/llvm/include/llvm/Transforms/Scalar/Float2Int.h
@@ -44,7 +44,7 @@ private:
std::optional<ConstantRange> calcRange(Instruction *I);
void walkBackwards();
void walkForwards();
- bool validateAndTransform();
+ bool validateAndTransform(const DataLayout &DL);
Value *convert(Instruction *I, Type *ToTy);
void cleanup();
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 371ad41..edbeede 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1023,11 +1023,44 @@ static void computeKnownBitsFromOperator(const Operator *I,
break;
}
case Instruction::Select: {
- computeKnownBits(I->getOperand(2), Known, Depth + 1, Q);
- computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
-
+ auto ComputeForArm = [&](Value *Arm, bool Invert) {
+ KnownBits Res(Known.getBitWidth());
+ computeKnownBits(Arm, Res, Depth + 1, Q);
+ // If we have a constant arm, we are done.
+ if (Res.isConstant())
+ return Res;
+
+ // See what condition implies about the bits of the two select arms.
+ KnownBits CondRes(Res.getBitWidth());
+ computeKnownBitsFromCond(Arm, I->getOperand(0), CondRes, Depth + 1, Q,
+ Invert);
+ // If we don't get any information from the condition, no reason to
+ // proceed.
+ if (CondRes.isUnknown())
+ return Res;
+
+ // We can have a conflict if the condition is dead, i.e. if we have
+ // (x | 64) < 32 ? (x | 64) : y
+ // we will have conflict at bit 6 from the condition/the `or`.
+ // In that case just return. It's not particularly important
+ // what we do, as this select is going to be simplified soon.
+ CondRes = CondRes.unionWith(Res);
+ if (CondRes.hasConflict())
+ return Res;
+
+ // Finally make sure the information we found is valid. This is relatively
+ // expensive so it's left for the very end.
+ if (!isGuaranteedNotToBeUndef(Arm, Q.AC, Q.CxtI, Q.DT, Depth + 1))
+ return Res;
+
+ // Finally, we know we get information from the condition and it's valid,
+ // so return it.
+ return CondRes;
+ };
// Only known if known in both the LHS and RHS.
- Known = Known.intersectWith(Known2);
+ Known =
+ ComputeForArm(I->getOperand(1), /*Invert=*/false)
+ .intersectWith(ComputeForArm(I->getOperand(2), /*Invert=*/true));
break;
}
case Instruction::FPTrunc:
@@ -5709,7 +5742,7 @@ llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
// looking for, then.
if (*req_idx != *i)
return FindInsertedValue(I->getAggregateOperand(), idx_range,
- *InsertBefore);
+ InsertBefore);
}
// If we end up here, the indices of the insertvalue match with those
// requested (though possibly only partially). Now we recursively look at
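Reduced to just the KnownBits operations involved, the select handling above behaves like the standalone sketch below (names are illustrative): facts implied by the condition are merged into an arm with unionWith, a conflict marks a dead condition and is discarded, and the two arms are combined with intersectWith because a select's result is only known where both arms agree.

  #include "llvm/Support/KnownBits.h"
  using llvm::KnownBits;

  static KnownBits knownBitsOfSelect(KnownBits TrueArm, KnownBits FalseArm,
                                     const KnownBits &CondImpliesTrue,
                                     const KnownBits &CondImpliesFalse) {
    // Merge what the condition implies about each arm; on conflict keep the
    // plain arm bits, since such a select is about to be simplified anyway.
    KnownBits T = TrueArm.unionWith(CondImpliesTrue);
    if (T.hasConflict())
      T = TrueArm;
    KnownBits F = FalseArm.unionWith(CondImpliesFalse);
    if (F.hasConflict())
      F = FalseArm;
    // A bit is known for the select only if it is known, with the same
    // value, in both arms.
    return T.intersectWith(F);
  }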
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
index 93fb2a8..0eb9c24 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -19,7 +19,7 @@
using namespace llvm;
PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
- // RemoveDIs: there's no bitcode representation of the DPValue debug-info,
+ // RemoveDIs: there's no bitcode representation of the DbgRecord debug-info,
// convert to dbg.values before writing out.
bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
if (IsNewDbgInfoFormat)
@@ -56,8 +56,8 @@ namespace {
StringRef getPassName() const override { return "Bitcode Writer"; }
bool runOnModule(Module &M) override {
- // RemoveDIs: there's no bitcode representation of the DPValue debug-info,
- // convert to dbg.values before writing out.
+ // RemoveDIs: there's no bitcode representation of the DbgRecord
+ // debug-info, convert to dbg.values before writing out.
bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
if (IsNewDbgInfoFormat)
M.convertFromNewDbgValues();
diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
index 55cdc3c..2e8e7d0 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -369,7 +369,8 @@ void AppleAccelTableWriter::emit() const {
DWARF5AccelTableData::DWARF5AccelTableData(const DIE &Die,
const uint32_t UnitID,
const bool IsTU)
- : OffsetVal(&Die), DieTag(Die.getTag()), IsTU(IsTU), UnitID(UnitID) {}
+ : OffsetVal(&Die), DieTag(Die.getTag()), AbbrevNumber(0), IsTU(IsTU),
+ UnitID(UnitID) {}
void Dwarf5AccelTableWriter::Header::emit(Dwarf5AccelTableWriter &Ctx) {
assert(CompUnitCount > 0 && "Index must have at least one CU.");
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index a4b819a..746926e56 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -217,13 +217,14 @@ void FunctionVarLocs::init(FunctionVarLocsBuilder &Builder) {
// to the start and end position in the vector with VarLocsBeforeInst. This
// block includes VarLocs for any DPValues attached to that instruction.
for (auto &P : Builder.VarLocsBeforeInst) {
- // Process VarLocs attached to a DPValue alongside their marker Instruction.
+ // Process VarLocs attached to a DbgRecord alongside their marker
+ // Instruction.
if (isa<const DbgRecord *>(P.first))
continue;
const Instruction *I = cast<const Instruction *>(P.first);
unsigned BlockStart = VarLocRecords.size();
- // Any VarLocInfos attached to a DPValue should now be remapped to their
- // marker Instruction, in order of DPValue appearance and prior to any
+ // Any VarLocInfos attached to a DbgRecord should now be remapped to their
+ // marker Instruction, in order of DbgRecord appearance and prior to any
// VarLocInfos attached directly to that instruction.
for (const DPValue &DPV : DPValue::filter(I->getDbgRecordRange())) {
// Even though DPV defines a variable location, VarLocsBeforeInst can
@@ -1649,7 +1650,7 @@ void AssignmentTrackingLowering::processUntaggedInstruction(
Ops.push_back(dwarf::DW_OP_deref);
DIE = DIExpression::prependOpcodes(DIE, Ops, /*StackValue=*/false,
/*EntryValue=*/false);
- // Find a suitable insert point, before the next instruction or DPValue
+ // Find a suitable insert point, before the next instruction or DbgRecord
// after I.
auto InsertBefore = getNextNode(&I);
assert(InsertBefore && "Shouldn't be inserting after a terminator");
@@ -1886,21 +1887,21 @@ void AssignmentTrackingLowering::resetInsertionPoint(DPValue &After) {
}
void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
- // If the block starts with DPValues, we need to process those DPValues as
+ // If the block starts with DbgRecords, we need to process those DbgRecords as
// their own frame without processing any instructions first.
- bool ProcessedLeadingDPValues = !BB.begin()->hasDbgRecords();
+ bool ProcessedLeadingDbgRecords = !BB.begin()->hasDbgRecords();
for (auto II = BB.begin(), EI = BB.end(); II != EI;) {
assert(VarsTouchedThisFrame.empty());
// Process the instructions in "frames". A "frame" includes a single
// non-debug instruction followed any debug instructions before the
// next non-debug instruction.
- // Skip the current instruction if it has unprocessed DPValues attached (see
- // comment above `ProcessedLeadingDPValues`).
- if (ProcessedLeadingDPValues) {
+ // Skip the current instruction if it has unprocessed DbgRecords attached
+ // (see comment above `ProcessedLeadingDbgRecords`).
+ if (ProcessedLeadingDbgRecords) {
// II is now either a debug intrinsic, a non-debug instruction with no
- // attached DPValues, or a non-debug instruction with attached processed
- // DPValues.
+ // attached DbgRecords, or a non-debug instruction with attached processed
+ // DbgRecords.
// II has not been processed.
if (!isa<DbgInfoIntrinsic>(&*II)) {
if (II->isTerminator())
@@ -1912,8 +1913,8 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
}
}
// II is now either a debug intrinsic, a non-debug instruction with no
- // attached DPValues, or a non-debug instruction with attached unprocessed
- // DPValues.
+ // attached DbgRecords, or a non-debug instruction with attached unprocessed
+ // DbgRecords.
if (II != EI && II->hasDbgRecords()) {
// Skip over non-variable debug records (i.e., labels). They're going to
// be read from IR (possibly re-ordering them within the debug record
@@ -1924,7 +1925,7 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
assert(LiveSet->isValid());
}
}
- ProcessedLeadingDPValues = true;
+ ProcessedLeadingDbgRecords = true;
while (II != EI) {
auto *Dbg = dyn_cast<DbgInfoIntrinsic>(&*II);
if (!Dbg)
@@ -1934,9 +1935,9 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
assert(LiveSet->isValid());
++II;
}
- // II is now a non-debug instruction either with no attached DPValues, or
- // with attached processed DPValues. II has not been processed, and all
- // debug instructions or DPValues in the frame preceding II have been
+ // II is now a non-debug instruction either with no attached DbgRecords, or
+ // with attached processed DbgRecords. II has not been processed, and all
+ // debug instructions or DbgRecords in the frame preceding II have been
// processed.
// We've processed everything in the "frame". Now determine which variables
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 59a0c64..055e275 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2946,7 +2946,7 @@ class TypePromotionTransaction {
Instruction *PrevInst;
BasicBlock *BB;
} Point;
- std::optional<DPValue::self_iterator> BeforeDPValue = std::nullopt;
+ std::optional<DbgRecord::self_iterator> BeforeDbgRecord = std::nullopt;
/// Remember whether or not the instruction had a previous instruction.
bool HasPrevInstruction;
@@ -2958,9 +2958,9 @@ class TypePromotionTransaction {
BasicBlock *BB = Inst->getParent();
// Record where we would have to re-insert the instruction in the sequence
- // of DPValues, if we ended up reinserting.
+ // of DbgRecords, if we ended up reinserting.
if (BB->IsNewDbgInfoFormat)
- BeforeDPValue = Inst->getDbgReinsertionPosition();
+ BeforeDbgRecord = Inst->getDbgReinsertionPosition();
if (HasPrevInstruction) {
Point.PrevInst = &*std::prev(Inst->getIterator());
@@ -2983,7 +2983,7 @@ class TypePromotionTransaction {
Inst->insertBefore(*Point.BB, Position);
}
- Inst->getParent()->reinsertInstInDbgRecords(Inst, BeforeDPValue);
+ Inst->getParent()->reinsertInstInDbgRecords(Inst, BeforeDbgRecord);
}
};
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 4ed44d1..8efe67a 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -982,7 +982,7 @@ void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V,
}
void llvm::printMIR(raw_ostream &OS, const Module &M) {
- // RemoveDIs: as there's no textual form for DPValues yet, print debug-info
+ // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
// in dbg.value format.
bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
if (IsNewDbgInfoFormat)
@@ -996,7 +996,7 @@ void llvm::printMIR(raw_ostream &OS, const Module &M) {
}
void llvm::printMIR(raw_ostream &OS, const MachineFunction &MF) {
- // RemoveDIs: as there's no textual form for DPValues yet, print debug-info
+ // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
// in dbg.value format.
bool IsNewDbgInfoFormat = MF.getFunction().IsNewDbgInfoFormat;
if (IsNewDbgInfoFormat)
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 40898d2..f65d532 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -645,12 +645,13 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
}
- // Duplicate implementation for DPValues, the non-instruction debug-info
- // record. Helper lambda for moving DPValues to the end block.
- auto TransferDPValues = [&](Instruction &I) {
- for (auto &DPValue : llvm::make_early_inc_range(I.getDbgRecordRange())) {
- DPValue.removeFromParent();
- EndBlock->insertDbgRecordBefore(&DPValue,
+ // Duplicate implementation for DbgRecords, the non-instruction debug-info
+ // format. Helper lambda for moving DbgRecords to the end block.
+ auto TransferDbgRecords = [&](Instruction &I) {
+ for (auto &DbgRecord :
+ llvm::make_early_inc_range(I.getDbgRecordRange())) {
+ DbgRecord.removeFromParent();
+ EndBlock->insertDbgRecordBefore(&DbgRecord,
EndBlock->getFirstInsertionPt());
}
};
@@ -660,7 +661,7 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
// middle" of the select group.
auto R = make_range(std::next(SI.getI()->getIterator()),
std::next(LastSI.getI()->getIterator()));
- llvm::for_each(R, TransferDPValues);
+ llvm::for_each(R, TransferDbgRecords);
// These are the new basic blocks for the conditional branch.
// At least one will become an actual new basic block.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 735cec8..40b078a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2799,8 +2799,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
// Limit this to after legalization if the add has wrap flags
(Level >= AfterLegalizeDAG || (!N->getFlags().hasNoUnsignedWrap() &&
!N->getFlags().hasNoSignedWrap()))) {
- SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
- DAG.getAllOnesConstant(DL, VT));
+ SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT);
return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not);
}
}
@@ -3025,8 +3024,7 @@ SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
// Limit this to after legalization if the add has wrap flags
(Level >= AfterLegalizeDAG || (!N0->getFlags().hasNoUnsignedWrap() &&
!N0->getFlags().hasNoSignedWrap()))) {
- SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
- DAG.getAllOnesConstant(DL, VT));
+ SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT);
return DAG.getNode(ISD::SUB, DL, VT, N1, Not);
}
@@ -3789,63 +3787,34 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1));
}
- // fold ((A+(B+or-C))-B) -> A+or-C
- if (N0.getOpcode() == ISD::ADD &&
- (N0.getOperand(1).getOpcode() == ISD::SUB ||
- N0.getOperand(1).getOpcode() == ISD::ADD) &&
- N0.getOperand(1).getOperand(0) == N1)
- return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0),
- N0.getOperand(1).getOperand(1));
-
- // fold ((A+(C+B))-B) -> A+C
- if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD &&
- N0.getOperand(1).getOperand(1) == N1)
- return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0),
- N0.getOperand(1).getOperand(0));
+ SDValue A, B, C;
+
+ // fold ((A+(B+C))-B) -> A+C
+ if (sd_match(N0, m_Add(m_Value(A), m_Add(m_Specific(N1), m_Value(C)))))
+ return DAG.getNode(ISD::ADD, DL, VT, A, C);
+
+ // fold ((A+(B-C))-B) -> A-C
+ if (sd_match(N0, m_Add(m_Value(A), m_Sub(m_Specific(N1), m_Value(C)))))
+ return DAG.getNode(ISD::SUB, DL, VT, A, C);
// fold ((A-(B-C))-C) -> A-B
- if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB &&
- N0.getOperand(1).getOperand(1) == N1)
- return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
- N0.getOperand(1).getOperand(0));
+ if (sd_match(N0, m_Sub(m_Value(A), m_Sub(m_Value(B), m_Specific(N1)))))
+ return DAG.getNode(ISD::SUB, DL, VT, A, B);
// fold (A-(B-C)) -> A+(C-B)
- if (N1.getOpcode() == ISD::SUB && N1.hasOneUse())
+ if (sd_match(N1, m_OneUse(m_Sub(m_Value(B), m_Value(C)))))
return DAG.getNode(ISD::ADD, DL, VT, N0,
- DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1),
- N1.getOperand(0)));
+ DAG.getNode(ISD::SUB, DL, VT, C, B));
// A - (A & B) -> A & (~B)
- if (N1.getOpcode() == ISD::AND) {
- SDValue A = N1.getOperand(0);
- SDValue B = N1.getOperand(1);
- if (A != N0)
- std::swap(A, B);
- if (A == N0 &&
- (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true))) {
- SDValue InvB =
- DAG.getNode(ISD::XOR, DL, VT, B, DAG.getAllOnesConstant(DL, VT));
- return DAG.getNode(ISD::AND, DL, VT, A, InvB);
- }
- }
+ if (sd_match(N1, m_And(m_Specific(N0), m_Value(B))) &&
+ (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true)))
+ return DAG.getNode(ISD::AND, DL, VT, N0, DAG.getNOT(DL, B, VT));
- // fold (X - (-Y * Z)) -> (X + (Y * Z))
- if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
- if (N1.getOperand(0).getOpcode() == ISD::SUB &&
- isNullOrNullSplat(N1.getOperand(0).getOperand(0))) {
- SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
- N1.getOperand(0).getOperand(1),
- N1.getOperand(1));
- return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
- }
- if (N1.getOperand(1).getOpcode() == ISD::SUB &&
- isNullOrNullSplat(N1.getOperand(1).getOperand(0))) {
- SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
- N1.getOperand(0),
- N1.getOperand(1).getOperand(1));
- return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
- }
- }
+ // fold (A - (-B * C)) -> (A + (B * C))
+ if (sd_match(N1, m_OneUse(m_Mul(m_Sub(m_Zero(), m_Value(B)), m_Value(C)))))
+ return DAG.getNode(ISD::ADD, DL, VT, N0,
+ DAG.getNode(ISD::MUL, DL, VT, B, C));
// If either operand of a sub is undef, the result is undef
if (N0.isUndef())
@@ -3865,12 +3834,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
if (SDValue V = foldSubToUSubSat(VT, N))
return V;
- // (x - y) - 1 -> add (xor y, -1), x
- if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() && isOneOrOneSplat(N1)) {
- SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
- DAG.getAllOnesConstant(DL, VT));
- return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
- }
+ // (A - B) - 1 -> add (xor B, -1), A
+ if (sd_match(N, m_Sub(m_OneUse(m_Sub(m_Value(A), m_Value(B))), m_One())))
+ return DAG.getNode(ISD::ADD, DL, VT, A, DAG.getNOT(DL, B, VT));
// Look for:
// sub y, (xor x, -1)
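The rewrites in this hunk all trade explicit opcode and operand checks for SDPatternMatch matchers. A minimal sketch of the matcher style, using a hypothetical fold that is not part of this patch (only sd_match, m_Add, m_Sub, m_Value, m_Deferred and m_OneUse from the header above are assumed):

    using namespace llvm::SDPatternMatch;

    // Hypothetical fold ((A - B) + B) -> A: m_Value binds an operand on first
    // use, m_Deferred re-matches a previously bound value, and m_OneUse
    // restricts the inner node to a single user.
    SDValue A, B;
    if (sd_match(N, m_Add(m_OneUse(m_Sub(m_Value(A), m_Value(B))),
                          m_Deferred(B))))
      return A;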
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c2430359..b8c7d08 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -37,6 +37,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
@@ -81,6 +82,7 @@
#include <vector>
using namespace llvm;
+using namespace llvm::SDPatternMatch;
/// makeVTList - Return an instance of the SDVTList struct initialized with the
/// specified members.
@@ -4290,21 +4292,15 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val, unsigned Depth) const {
return isKnownToBeAPowerOfTwo(Val.getOperand(2), Depth + 1) &&
isKnownToBeAPowerOfTwo(Val.getOperand(1), Depth + 1);
- if (Val.getOpcode() == ISD::AND) {
- // Looking for `x & -x` pattern:
- // If x == 0:
- // x & -x -> 0
- // If x != 0:
- // x & -x -> non-zero pow2
- // so if we find the pattern return whether we know `x` is non-zero.
- for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) {
- SDValue NegOp = Val.getOperand(OpIdx);
- if (NegOp.getOpcode() == ISD::SUB &&
- NegOp.getOperand(1) == Val.getOperand(1 - OpIdx) &&
- isNullOrNullSplat(NegOp.getOperand(0)))
- return isKnownNeverZero(Val.getOperand(1 - OpIdx), Depth);
- }
- }
+ // Looking for `x & -x` pattern:
+ // If x == 0:
+ // x & -x -> 0
+ // If x != 0:
+ // x & -x -> non-zero pow2
+ // so if we find the pattern return whether we know `x` is non-zero.
+ SDValue X;
+ if (sd_match(Val, m_And(m_Value(X), m_Sub(m_Zero(), m_Deferred(X)))))
+ return isKnownNeverZero(X, Depth);
if (Val.getOpcode() == ISD::ZERO_EXTEND)
return isKnownToBeAPowerOfTwo(Val.getOperand(0), Depth + 1);
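The rewritten power-of-two check leans on a bit-twiddling identity. A standalone C++ check of that identity, independent of any LLVM types:

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    // x & -x isolates the lowest set bit: it is 0 exactly when x == 0 and a
    // power of two otherwise, which is why proving x non-zero suffices above.
    int main() {
      for (uint32_t X : {0u, 1u, 6u, 40u, 0x80000000u}) {
        uint32_t Low = X & (0u - X);
        if (X == 0)
          assert(Low == 0);
        else
          assert(Low != 0 && (Low & (Low - 1)) == 0); // exactly one bit set
      }
    }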
diff --git a/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/llvm/lib/CodeGen/SjLjEHPrepare.cpp
index 515b576..4bad57d 100644
--- a/llvm/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/llvm/lib/CodeGen/SjLjEHPrepare.cpp
@@ -150,9 +150,7 @@ static void MarkBlocksLiveIn(BasicBlock *BB,
if (!LiveBBs.insert(BB).second)
return; // already been here.
- df_iterator_default_set<BasicBlock*> Visited;
-
- for (BasicBlock *B : inverse_depth_first_ext(BB, Visited))
+ for (BasicBlock *B : inverse_depth_first(BB))
LiveBBs.insert(B);
}
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp
index d346214..57ac991e 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp
@@ -87,7 +87,7 @@ static void registerJITLoaderVTuneUnregisterImpl(
for (auto &Method : UM) {
JITEventWrapper::Wrapper->iJIT_NotifyEvent(
iJVM_EVENT_TYPE_METHOD_UNLOAD_START,
- const_cast<unsigned long *>(&Method.first));
+ const_cast<uint64_t *>(&Method.first));
}
}
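The cast change is a portability fix: const_cast may only change qualifiers, so the pointee type must match the key type exactly. A short illustration (that the key is 64-bit is inferred from the new cast, not from code shown here):

    #include <cstdint>

    // On LLP64 targets such as 64-bit Windows, unsigned long is 32 bits, so
    // const_cast<unsigned long *> on a pointer to a 64-bit key is ill-formed
    // there; the fixed-width uint64_t matches on every platform.
    static_assert(sizeof(uint64_t) == 8, "fixed-width key is always 64 bits");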
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index d65ed8c1..c74c898 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2110,7 +2110,7 @@ Function *getFreshReductionFunc(Module &M) {
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
const LocationDescription &Loc, InsertPointTy AllocaIP,
- ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
+ ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef) {
for (const ReductionInfo &RI : ReductionInfos) {
(void)RI;
assert(RI.Variable && "expected non-null variable");
@@ -2197,17 +2197,29 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
for (auto En : enumerate(ReductionInfos)) {
const ReductionInfo &RI = En.value();
Type *ValueType = RI.ElementType;
- Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable,
- "red.value." + Twine(En.index()));
+ // We have one less load for the by-ref case because that load is now
+ // inside the reduction region.
+ Value *RedValue = nullptr;
+ if (!IsByRef) {
+ RedValue = Builder.CreateLoad(ValueType, RI.Variable,
+ "red.value." + Twine(En.index()));
+ }
Value *PrivateRedValue =
Builder.CreateLoad(ValueType, RI.PrivateVariable,
"red.private.value." + Twine(En.index()));
Value *Reduced;
- Builder.restoreIP(
- RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced));
+ if (IsByRef) {
+ Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RI.Variable,
+ PrivateRedValue, Reduced));
+ } else {
+ Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RedValue,
+ PrivateRedValue, Reduced));
+ }
if (!Builder.GetInsertBlock())
return InsertPointTy();
- Builder.CreateStore(Reduced, RI.Variable);
+ // For the by-ref case, the load is inside the reduction region.
+ if (!IsByRef)
+ Builder.CreateStore(Reduced, RI.Variable);
}
Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
@@ -2219,7 +2231,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
// function. There are no loads/stores here because they will be happening
// inside the atomic elementwise reduction.
Builder.SetInsertPoint(AtomicRedBlock);
- if (CanGenerateAtomic) {
+ if (CanGenerateAtomic && !IsByRef) {
for (const ReductionInfo &RI : ReductionInfos) {
Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
RI.Variable, RI.PrivateVariable));
@@ -2257,7 +2269,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
if (!Builder.GetInsertBlock())
return InsertPointTy();
- Builder.CreateStore(Reduced, LHSPtr);
+ // The store is inside the reduction region when using by-ref.
+ if (!IsByRef)
+ Builder.CreateStore(Reduced, LHSPtr);
}
Builder.CreateRetVoid();
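A hedged sketch of what a caller of the extended API might look like; OMPBuilder, Loc, AllocaIP and Infos are illustrative names, not taken from this patch:

    // With IsByRef = true, createReductions neither loads RI.Variable up
    // front nor stores the combined value back; ReductionGen receives the
    // variable pointer and is expected to do both inside the region.
    OpenMPIRBuilder::InsertPointTy AfterIP = OMPBuilder.createReductions(
        Loc, AllocaIP, Infos, /*IsNoWait=*/false, /*IsByRef=*/true);
    Builder.restoreIP(AfterIP);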
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 1beb4c0..11383ea 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -4592,7 +4592,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
void AssemblyWriter::printDPMarker(const DPMarker &Marker) {
// There's no formal representation of a DPMarker -- print purely as a
// debugging aid.
- for (const DbgRecord &DPR : Marker.StoredDPValues) {
+ for (const DbgRecord &DPR : Marker.StoredDbgRecords) {
printDbgRecord(DPR);
Out << "\n";
}
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 7ead7ce..4dd1bdd 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -63,9 +63,9 @@ DPMarker *BasicBlock::createMarker(InstListType::iterator It) {
void BasicBlock::convertToNewDbgValues() {
IsNewDbgInfoFormat = true;
- // Iterate over all instructions in the instruction list, collecting dbg.value
- // instructions and converting them to DPValues. Once we find a "real"
- // instruction, attach all those DPValues to a DPMarker in that instruction.
+ // Iterate over all instructions in the instruction list, collecting debug
+ // info intrinsics and converting them to DbgRecords. Once we find a "real"
+ // instruction, attach all those DbgRecords to a DPMarker in that instruction.
SmallVector<DbgRecord *, 4> DPVals;
for (Instruction &I : make_early_inc_range(InstList)) {
assert(!I.DbgMarker && "DbgMarker already set on old-format instrs?");
@@ -86,7 +86,7 @@ void BasicBlock::convertToNewDbgValues() {
if (DPVals.empty())
continue;
- // Create a marker to store DPValues in.
+ // Create a marker to store DbgRecords in.
createMarker(&I);
DPMarker *Marker = I.DbgMarker;
@@ -102,7 +102,7 @@ void BasicBlock::convertFromNewDbgValues() {
IsNewDbgInfoFormat = false;
// Iterate over the block, finding instructions annotated with DPMarkers.
- // Convert any attached DPValues to dbg.values and insert ahead of the
+ // Convert any attached DbgRecords to debug intrinsics and insert ahead of the
// instruction.
for (auto &Inst : *this) {
if (!Inst.DbgMarker)
@@ -116,7 +116,7 @@ void BasicBlock::convertFromNewDbgValues() {
Marker.eraseFromParent();
}
- // Assume no trailing DPValues: we could technically create them at the end
+ // Assume no trailing DbgRecords: we could technically create them at the end
// of the block, after a terminator, but this would be non-canonical and
// indicates that something else is broken somewhere.
assert(!getTrailingDbgRecords());
@@ -691,15 +691,15 @@ void BasicBlock::renumberInstructions() {
NumInstrRenumberings++;
}
-void BasicBlock::flushTerminatorDbgValues() {
- // If we erase the terminator in a block, any DPValues will sink and "fall
+void BasicBlock::flushTerminatorDbgRecords() {
+ // If we erase the terminator in a block, any DbgRecords will sink and "fall
// off the end", existing after any terminator that gets inserted. With
// dbg.value intrinsics we would just insert the terminator at end() and
- // the dbg.values would come before the terminator. With DPValues, we must
+ // the dbg.values would come before the terminator. With DbgRecords, we must
// do this manually.
// To get out of this unfortunate form, whenever we insert a terminator,
- // check whether there's anything trailing at the end and move those DPValues
- // in front of the terminator.
+ // check whether there's anything trailing at the end and move those
+ // DbgRecords in front of the terminator.
// Do nothing if we're not in new debug-info format.
if (!IsNewDbgInfoFormat)
@@ -710,15 +710,15 @@ void BasicBlock::flushTerminatorDbgValues() {
if (!Term)
return;
- // Are there any dangling DPValues?
- DPMarker *TrailingDPValues = getTrailingDbgRecords();
- if (!TrailingDPValues)
+ // Are there any dangling DbgRecords?
+ DPMarker *TrailingDbgRecords = getTrailingDbgRecords();
+ if (!TrailingDbgRecords)
return;
- // Transfer DPValues from the trailing position onto the terminator.
+ // Transfer DbgRecords from the trailing position onto the terminator.
createMarker(Term);
- Term->DbgMarker->absorbDebugValues(*TrailingDPValues, false);
- TrailingDPValues->eraseFromParent();
+ Term->DbgMarker->absorbDebugValues(*TrailingDbgRecords, false);
+ TrailingDbgRecords->eraseFromParent();
deleteTrailingDbgRecords();
}
@@ -735,7 +735,7 @@ void BasicBlock::spliceDebugInfoEmptyBlock(BasicBlock::iterator Dest,
// If an optimisation pass attempts to splice the contents of the block from
// BB1->begin() to BB1->getTerminator(), then the dbg.value will be
// transferred to the destination.
- // However, in the "new" DPValue format for debug-info, that range is empty:
+ // However, in the "new" DbgRecord format for debug-info, that range is empty:
// begin() returns an iterator to the terminator, as there will only be a
// single instruction in the block. We must piece together from the bits set
// in the iterators whether there was the intention to transfer any debug
@@ -750,16 +750,16 @@ void BasicBlock::spliceDebugInfoEmptyBlock(BasicBlock::iterator Dest,
bool ReadFromHead = First.getHeadBit();
// If the source block is completely empty, including no terminator, then
- // transfer any trailing DPValues that are still hanging around. This can
+ // transfer any trailing DbgRecords that are still hanging around. This can
// occur when a block is optimised away and the terminator has been moved
// somewhere else.
if (Src->empty()) {
- DPMarker *SrcTrailingDPValues = Src->getTrailingDbgRecords();
- if (!SrcTrailingDPValues)
+ DPMarker *SrcTrailingDbgRecords = Src->getTrailingDbgRecords();
+ if (!SrcTrailingDbgRecords)
return;
Dest->adoptDbgRecords(Src, Src->end(), InsertAtHead);
- // adoptDbgRecords should have released the trailing DPValues.
+ // adoptDbgRecords should have released the trailing DbgRecords.
assert(!Src->getTrailingDbgRecords());
return;
}
@@ -785,8 +785,8 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
/* Do a quick normalisation before calling the real splice implementation. We
might be operating on a degenerate basic block that has no instructions
in it, a legitimate transient state. In that case, Dest will be end() and
- any DPValues temporarily stored in the TrailingDPValues map in LLVMContext.
- We might illustrate it thus:
+ any DbgRecords are temporarily stored in the TrailingDbgRecords map in
+ LLVMContext. We might illustrate it thus:
Dest
|
@@ -795,35 +795,35 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
| |
First Last
- However: does the caller expect the "~" DPValues to end up before or after
- the spliced segment? This is communciated in the "Head" bit of Dest, which
- signals whether the caller called begin() or end() on this block.
+ However: does the caller expect the "~" DbgRecords to end up before or
+ after the spliced segment? This is communciated in the "Head" bit of Dest,
+ which signals whether the caller called begin() or end() on this block.
- If the head bit is set, then all is well, we leave DPValues trailing just
+ If the head bit is set, then all is well, we leave DbgRecords trailing just
like how dbg.value instructions would trail after instructions spliced to
the beginning of this block.
- If the head bit isn't set, then try to jam the "~" DPValues onto the front
- of the First instruction, then splice like normal, which joins the "~"
- DPValues with the "+" DPValues. However if the "+" DPValues are supposed to
- be left behind in Src, then:
- * detach the "+" DPValues,
- * move the "~" DPValues onto First,
+ If the head bit isn't set, then try to jam the "~" DbgRecords onto the
+ front of the First instruction, then splice like normal, which joins the
+ "~" DbgRecords with the "+" DbgRecords. However if the "+" DbgRecords are
+ supposed to be left behind in Src, then:
+ * detach the "+" DbgRecords,
+ * move the "~" DbgRecords onto First,
* splice like normal,
- * replace the "+" DPValues onto the Last position.
+ * replace the "+" DbgRecords onto the Last position.
Complicated, but gets the job done. */
- // If we're inserting at end(), and not in front of dangling DPValues, then
- // move the DPValues onto "First". They'll then be moved naturally in the
+ // If we're inserting at end(), and not in front of dangling DbgRecords, then
+ // move the DbgRecords onto "First". They'll then be moved naturally in the
// splice process.
- DPMarker *MoreDanglingDPValues = nullptr;
- DPMarker *OurTrailingDPValues = getTrailingDbgRecords();
- if (Dest == end() && !Dest.getHeadBit() && OurTrailingDPValues) {
- // Are the "+" DPValues not supposed to move? If so, detach them
+ DPMarker *MoreDanglingDbgRecords = nullptr;
+ DPMarker *OurTrailingDbgRecords = getTrailingDbgRecords();
+ if (Dest == end() && !Dest.getHeadBit() && OurTrailingDbgRecords) {
+ // Are the "+" DbgRecords not supposed to move? If so, detach them
// temporarily.
if (!First.getHeadBit() && First->hasDbgRecords()) {
- MoreDanglingDPValues = Src->getMarker(First);
- MoreDanglingDPValues->removeFromParent();
+ MoreDanglingDbgRecords = Src->getMarker(First);
+ MoreDanglingDbgRecords->removeFromParent();
}
if (First->hasDbgRecords()) {
@@ -839,8 +839,8 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
// No current marker, create one and absorb in. (FIXME: we can avoid an
// allocation in the future).
DPMarker *CurMarker = Src->createMarker(&*First);
- CurMarker->absorbDebugValues(*OurTrailingDPValues, false);
- OurTrailingDPValues->eraseFromParent();
+ CurMarker->absorbDebugValues(*OurTrailingDbgRecords, false);
+ OurTrailingDbgRecords->eraseFromParent();
}
deleteTrailingDbgRecords();
First.setHeadBit(true);
@@ -849,16 +849,16 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
// Call the main debug-info-splicing implementation.
spliceDebugInfoImpl(Dest, Src, First, Last);
- // Do we have some "+" DPValues hanging around that weren't supposed to move,
- // and we detached to make things easier?
- if (!MoreDanglingDPValues)
+ // Do we have some "+" DbgRecords hanging around that weren't supposed to
+ // move, and we detached to make things easier?
+ if (!MoreDanglingDbgRecords)
return;
// FIXME: we could avoid an allocation here sometimes. (adoptDbgRecords
// requires an iterator).
DPMarker *LastMarker = Src->createMarker(Last);
- LastMarker->absorbDebugValues(*MoreDanglingDPValues, true);
- MoreDanglingDPValues->eraseFromParent();
+ LastMarker->absorbDebugValues(*MoreDanglingDbgRecords, true);
+ MoreDanglingDbgRecords->eraseFromParent();
}
void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
@@ -870,15 +870,16 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
bool InsertAtHead = Dest.getHeadBit();
bool ReadFromHead = First.getHeadBit();
// Use this flag to signal the abnormal case, where we don't want to copy the
- // DPValues ahead of the "Last" position.
+ // DbgRecords ahead of the "Last" position.
bool ReadFromTail = !Last.getTailBit();
bool LastIsEnd = (Last == Src->end());
/*
Here's an illustration of what we're about to do. We have two blocks, this
and Src, and two segments of list. Each instruction is marked by a capital
- while potential DPValue debug-info is marked out by "-" characters and a few
- other special characters (+:=) where I want to highlight what's going on.
+ while potential DbgRecord debug-info is marked out by "-" characters and a
+ few other special characters (+:=) where I want to highlight what's going
+ on.
Dest
|
@@ -889,18 +890,18 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
The splice method is going to take all the instructions from First up to
(but not including) Last and insert them in _front_ of Dest, forming one
- long list. All the DPValues attached to instructions _between_ First and
+ long list. All the DbgRecords attached to instructions _between_ First and
Last need no maintenance. However, we have to do special things with the
- DPValues marked with the +:= characters. We only have three positions:
- should the "+" DPValues be transferred, and if so to where? Do we move the
- ":" DPValues? Would they go in front of the "=" DPValues, or should the "="
- DPValues go before "+" DPValues?
+ DbgRecords marked with the +:= characters. We only have three positions:
+ should the "+" DbgRecords be transferred, and if so to where? Do we move the
+ ":" DbgRecords? Would they go in front of the "=" DbgRecords, or should the
+ "=" DbgRecords go before "+" DbgRecords?
We're told which way it should be by the bits carried in the iterators. The
"Head" bit indicates whether the specified position is supposed to be at the
- front of the attached DPValues (true) or not (false). The Tail bit is true
- on the other end of a range: is the range intended to include DPValues up to
- the end (false) or not (true).
+ front of the attached DbgRecords (true) or not (false). The Tail bit is true
+ on the other end of a range: is the range intended to include DbgRecords up
+ to the end (false) or not (true).
FIXME: the tail bit doesn't need to be distinct from the head bit, we could
combine them.
@@ -934,15 +935,16 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
*/
- // Detach the marker at Dest -- this lets us move the "====" DPValues around.
+ // Detach the marker at Dest -- this lets us move the "====" DbgRecords
+ // around.
DPMarker *DestMarker = nullptr;
if (Dest != end()) {
if ((DestMarker = getMarker(Dest)))
DestMarker->removeFromParent();
}
- // If we're moving the tail range of DPValues (":::"), absorb them into the
- // front of the DPValues at Dest.
+ // If we're moving the tail range of DbgRecords (":::"), absorb them into the
+ // front of the DbgRecords at Dest.
if (ReadFromTail && Src->getMarker(Last)) {
DPMarker *FromLast = Src->getMarker(Last);
if (LastIsEnd) {
@@ -956,7 +958,7 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
}
}
- // If we're _not_ reading from the head of First, i.e. the "++++" DPValues,
+ // If we're _not_ reading from the head of First, i.e. the "++++" DbgRecords,
// move their markers onto Last. They remain in the Src block. No action
// needed.
if (!ReadFromHead && First->hasDbgRecords()) {
@@ -970,16 +972,16 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
}
}
- // Finally, do something with the "====" DPValues we detached.
+ // Finally, do something with the "====" DbgRecords we detached.
if (DestMarker) {
if (InsertAtHead) {
- // Insert them at the end of the DPValues at Dest. The "::::" DPValues
+ // Insert them at the end of the DbgRecords at Dest. The "::::" DbgRecords
// might be in front of them.
DPMarker *NewDestMarker = createMarker(Dest);
NewDestMarker->absorbDebugValues(*DestMarker, false);
} else {
// Insert them right at the start of the range we moved, ahead of First
- // and the "++++" DPValues.
+ // and the "++++" DbgRecords.
DPMarker *FirstMarker = createMarker(First);
FirstMarker->absorbDebugValues(*DestMarker, true);
}
@@ -990,10 +992,10 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
// any trailing debug-info at the end of the block would "normally" have
// been pushed in front of "First". Move it there now.
DPMarker *FirstMarker = getMarker(First);
- DPMarker *TrailingDPValues = getTrailingDbgRecords();
- if (TrailingDPValues) {
- FirstMarker->absorbDebugValues(*TrailingDPValues, true);
- TrailingDPValues->eraseFromParent();
+ DPMarker *TrailingDbgRecords = getTrailingDbgRecords();
+ if (TrailingDbgRecords) {
+ FirstMarker->absorbDebugValues(*TrailingDbgRecords, true);
+ TrailingDbgRecords->eraseFromParent();
deleteTrailingDbgRecords();
}
}
@@ -1024,7 +1026,7 @@ void BasicBlock::splice(iterator Dest, BasicBlock *Src, iterator First,
// And move the instructions.
getInstList().splice(Dest, Src->getInstList(), First, Last);
- flushTerminatorDbgValues();
+ flushTerminatorDbgRecords();
}
void BasicBlock::insertDbgRecordAfter(DbgRecord *DPV, Instruction *I) {
@@ -1057,38 +1059,40 @@ DPMarker *BasicBlock::getMarker(InstListType::iterator It) {
}
void BasicBlock::reinsertInstInDbgRecords(
- Instruction *I, std::optional<DPValue::self_iterator> Pos) {
+ Instruction *I, std::optional<DbgRecord::self_iterator> Pos) {
// "I" was originally removed from a position where it was
- // immediately in front of Pos. Any DPValues on that position then "fell down"
- // onto Pos. "I" has been re-inserted at the front of that wedge of DPValues,
- // shuffle them around to represent the original positioning. To illustrate:
+ // immediately in front of Pos. Any DbgRecords on that position then "fell
+ // down" onto Pos. "I" has been re-inserted at the front of that wedge of
+ // DbgRecords, shuffle them around to represent the original positioning. To
+ // illustrate:
//
// Instructions: I1---I---I0
- // DPValues: DDD DDD
+ // DbgRecords: DDD DDD
//
// Instruction "I" removed,
//
// Instructions: I1------I0
- // DPValues: DDDDDD
+ // DbgRecords: DDDDDD
// ^Pos
//
// Instruction "I" re-inserted (now):
//
// Instructions: I1---I------I0
- // DPValues: DDDDDD
+ // DbgRecords: DDDDDD
// ^Pos
//
// After this method completes:
//
// Instructions: I1---I---I0
- // DPValues: DDD DDD
+ // DbgRecords: DDD DDD
- // This happens if there were no DPValues on I0. Are there now DPValues there?
+ // This happens if there were no DbgRecords on I0. Are there now DbgRecords
+ // there?
if (!Pos) {
DPMarker *NextMarker = getNextMarker(I);
if (!NextMarker)
return;
- if (NextMarker->StoredDPValues.empty())
+ if (NextMarker->StoredDbgRecords.empty())
return;
// There are DPMarkers there now -- they fell down from "I".
DPMarker *ThisMarker = createMarker(I);
@@ -1096,15 +1100,15 @@ void BasicBlock::reinsertInstInDbgRecords(
return;
}
- // Is there even a range of DPValues to move?
+ // Is there even a range of DbgRecords to move?
DPMarker *DPM = (*Pos)->getMarker();
- auto Range = make_range(DPM->StoredDPValues.begin(), (*Pos));
+ auto Range = make_range(DPM->StoredDbgRecords.begin(), (*Pos));
if (Range.begin() == Range.end())
return;
// Otherwise: splice.
DPMarker *ThisMarker = createMarker(I);
- assert(ThisMarker->StoredDPValues.empty());
+ assert(ThisMarker->StoredDbgRecords.empty());
ThisMarker->absorbDebugValues(Range, *DPM, true);
}
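Outside of these internals, the renamed list is reached through the marker's range accessor rather than by touching StoredDbgRecords directly. A small fragment (assuming an Instruction &I in the new debug-info format):

    // Walk the DbgRecords stored ahead of I via its marker; DbgMarker is
    // null when no records are attached.
    if (DPMarker *M = I.DbgMarker)
      for (DbgRecord &DR : M->getDbgRecordRange())
        (void)DR; // inspect, clone or move the record here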
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index e63b1e6..d168950 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -895,7 +895,7 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) {
if (I.hasMetadataOtherThanDebugLoc())
I.setMetadata("heapallocsite", nullptr);
- // Strip any DPValues attached.
+ // Strip any DbgRecords attached.
I.dropDbgRecords();
}
}
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index 019b00c..f34d3ae 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -1,4 +1,4 @@
-//======-- DebugProgramInstruction.cpp - Implement DPValues/DPMarkers --======//
+//=====-- DebugProgramInstruction.cpp - Implement DbgRecords/DPMarkers --=====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -541,21 +541,21 @@ void DbgRecord::moveAfter(DbgRecord *MoveAfter) {
///////////////////////////////////////////////////////////////////////////////
// An empty, global, DPMarker for the purpose of describing empty ranges of
-// DPValues.
+// DbgRecords.
DPMarker DPMarker::EmptyDPMarker;
void DPMarker::dropDbgRecords() {
- while (!StoredDPValues.empty()) {
- auto It = StoredDPValues.begin();
+ while (!StoredDbgRecords.empty()) {
+ auto It = StoredDbgRecords.begin();
DbgRecord *DR = &*It;
- StoredDPValues.erase(It);
+ StoredDbgRecords.erase(It);
DR->deleteRecord();
}
}
void DPMarker::dropOneDbgRecord(DbgRecord *DR) {
assert(DR->getMarker() == this);
- StoredDPValues.erase(DR->getIterator());
+ StoredDbgRecords.erase(DR->getIterator());
DR->deleteRecord();
}
@@ -566,15 +566,15 @@ const BasicBlock *DPMarker::getParent() const {
BasicBlock *DPMarker::getParent() { return MarkedInstr->getParent(); }
void DPMarker::removeMarker() {
- // Are there any DPValues in this DPMarker? If not, nothing to preserve.
+ // Are there any DbgRecords in this DPMarker? If not, nothing to preserve.
Instruction *Owner = MarkedInstr;
- if (StoredDPValues.empty()) {
+ if (StoredDbgRecords.empty()) {
eraseFromParent();
Owner->DbgMarker = nullptr;
return;
}
- // The attached DPValues need to be preserved; attach them to the next
+ // The attached DbgRecords need to be preserved; attach them to the next
// instruction. If there isn't a next instruction, put them on the
// "trailing" list.
DPMarker *NextMarker = Owner->getParent()->getNextMarker(Owner);
@@ -610,15 +610,15 @@ void DPMarker::eraseFromParent() {
}
iterator_range<DbgRecord::self_iterator> DPMarker::getDbgRecordRange() {
- return make_range(StoredDPValues.begin(), StoredDPValues.end());
+ return make_range(StoredDbgRecords.begin(), StoredDbgRecords.end());
}
iterator_range<DbgRecord::const_self_iterator>
DPMarker::getDbgRecordRange() const {
- return make_range(StoredDPValues.begin(), StoredDPValues.end());
+ return make_range(StoredDbgRecords.begin(), StoredDbgRecords.end());
}
void DbgRecord::removeFromParent() {
- getMarker()->StoredDPValues.erase(getIterator());
+ getMarker()->StoredDbgRecords.erase(getIterator());
Marker = nullptr;
}
@@ -628,29 +628,29 @@ void DbgRecord::eraseFromParent() {
}
void DPMarker::insertDbgRecord(DbgRecord *New, bool InsertAtHead) {
- auto It = InsertAtHead ? StoredDPValues.begin() : StoredDPValues.end();
- StoredDPValues.insert(It, *New);
+ auto It = InsertAtHead ? StoredDbgRecords.begin() : StoredDbgRecords.end();
+ StoredDbgRecords.insert(It, *New);
New->setMarker(this);
}
void DPMarker::insertDbgRecord(DbgRecord *New, DbgRecord *InsertBefore) {
assert(InsertBefore->getMarker() == this &&
- "DPValue 'InsertBefore' must be contained in this DPMarker!");
- StoredDPValues.insert(InsertBefore->getIterator(), *New);
+ "DbgRecord 'InsertBefore' must be contained in this DPMarker!");
+ StoredDbgRecords.insert(InsertBefore->getIterator(), *New);
New->setMarker(this);
}
void DPMarker::insertDbgRecordAfter(DbgRecord *New, DbgRecord *InsertAfter) {
assert(InsertAfter->getMarker() == this &&
- "DPValue 'InsertAfter' must be contained in this DPMarker!");
- StoredDPValues.insert(++(InsertAfter->getIterator()), *New);
+ "DbgRecord 'InsertAfter' must be contained in this DPMarker!");
+ StoredDbgRecords.insert(++(InsertAfter->getIterator()), *New);
New->setMarker(this);
}
void DPMarker::absorbDebugValues(DPMarker &Src, bool InsertAtHead) {
- auto It = InsertAtHead ? StoredDPValues.begin() : StoredDPValues.end();
- for (DbgRecord &DPV : Src.StoredDPValues)
+ auto It = InsertAtHead ? StoredDbgRecords.begin() : StoredDbgRecords.end();
+ for (DbgRecord &DPV : Src.StoredDbgRecords)
DPV.setMarker(this);
- StoredDPValues.splice(It, Src.StoredDPValues);
+ StoredDbgRecords.splice(It, Src.StoredDbgRecords);
}
void DPMarker::absorbDebugValues(iterator_range<DbgRecord::self_iterator> Range,
@@ -659,45 +659,45 @@ void DPMarker::absorbDebugValues(iterator_range<DbgRecord::self_iterator> Range,
DR.setMarker(this);
auto InsertPos =
- (InsertAtHead) ? StoredDPValues.begin() : StoredDPValues.end();
+ (InsertAtHead) ? StoredDbgRecords.begin() : StoredDbgRecords.end();
- StoredDPValues.splice(InsertPos, Src.StoredDPValues, Range.begin(),
- Range.end());
+ StoredDbgRecords.splice(InsertPos, Src.StoredDbgRecords, Range.begin(),
+ Range.end());
}
iterator_range<simple_ilist<DbgRecord>::iterator> DPMarker::cloneDebugInfoFrom(
DPMarker *From, std::optional<simple_ilist<DbgRecord>::iterator> from_here,
bool InsertAtHead) {
DbgRecord *First = nullptr;
- // Work out what range of DPValues to clone: normally all the contents of the
- // "From" marker, optionally we can start from the from_here position down to
- // end().
+ // Work out what range of DbgRecords to clone: normally all the contents of
+ // the "From" marker; optionally we can start from the from_here position down
+ // to end().
auto Range =
- make_range(From->StoredDPValues.begin(), From->StoredDPValues.end());
+ make_range(From->StoredDbgRecords.begin(), From->StoredDbgRecords.end());
if (from_here.has_value())
- Range = make_range(*from_here, From->StoredDPValues.end());
+ Range = make_range(*from_here, From->StoredDbgRecords.end());
// Clone each DbgRecord and insert into StoredDbgRecords; optionally place them at
// the start or the end of the list.
- auto Pos = (InsertAtHead) ? StoredDPValues.begin() : StoredDPValues.end();
+ auto Pos = (InsertAtHead) ? StoredDbgRecords.begin() : StoredDbgRecords.end();
for (DbgRecord &DR : Range) {
DbgRecord *New = DR.clone();
New->setMarker(this);
- StoredDPValues.insert(Pos, *New);
+ StoredDbgRecords.insert(Pos, *New);
if (!First)
First = New;
}
if (!First)
- return {StoredDPValues.end(), StoredDPValues.end()};
+ return {StoredDbgRecords.end(), StoredDbgRecords.end()};
if (InsertAtHead)
// If InsertAtHead is set, we cloned a range onto the front of the
- // StoredDPValues collection, return that range.
- return {StoredDPValues.begin(), Pos};
+ // StoredDbgRecords collection, return that range.
+ return {StoredDbgRecords.begin(), Pos};
else
// We inserted a block at the end, return that range.
- return {First->getIterator(), StoredDPValues.end()};
+ return {First->getIterator(), StoredDbgRecords.end()};
}
} // end namespace llvm
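Both absorbDebugValues overloads bottom out in simple_ilist::splice, which relinks nodes without copying or allocating; that is why the Marker back-pointer has to be fixed up by hand first. A self-contained sketch of the primitive (Node is a stand-in type):

    #include "llvm/ADT/simple_ilist.h"
    using namespace llvm;

    struct Node : ilist_node<Node> {
      int V;
      Node(int V) : V(V) {}
    };

    void demo() {
      Node A(1), B(2), C(3);
      simple_ilist<Node> Src, Dst; // non-owning intrusive lists
      Src.push_back(A);
      Src.push_back(B);
      Dst.push_back(C);
      // Move everything from Src to the front of Dst -- the InsertAtHead
      // case above; nodes are only relinked, never copied.
      Dst.splice(Dst.begin(), Src);
    }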
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index e089239..7a677d7 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -143,7 +143,7 @@ void Instruction::insertBefore(BasicBlock &BB,
return;
// We've inserted "this": if InsertAtHead is set then it comes before any
- // DPValues attached to InsertPos. But if it's not set, then any DPValues
+ // DbgRecords attached to InsertPos. But if it's not set, then any DbgRecords
// should now come before "this".
bool InsertAtHead = InsertPos.getHeadBit();
if (!InsertAtHead) {
@@ -166,10 +166,10 @@ void Instruction::insertBefore(BasicBlock &BB,
}
// If we're inserting a terminator, check if we need to flush out
- // TrailingDPValues. Inserting instructions at the end of an incomplete
+ // TrailingDbgRecords. Inserting instructions at the end of an incomplete
// block is handled by the code block above.
if (isTerminator())
- getParent()->flushTerminatorDbgValues();
+ getParent()->flushTerminatorDbgRecords();
}
/// Unlink this instruction from its current basic block and insert it into the
@@ -212,12 +212,12 @@ void Instruction::moveBeforeImpl(BasicBlock &BB, InstListType::iterator I,
assert(I == BB.end() || I->getParent() == &BB);
bool InsertAtHead = I.getHeadBit();
- // If we've been given the "Preserve" flag, then just move the DPValues with
+ // If we've been given the "Preserve" flag, then just move the DbgRecords with
// the instruction, no more special handling needed.
if (BB.IsNewDbgInfoFormat && DbgMarker && !Preserve) {
if (I != this->getIterator() || InsertAtHead) {
// "this" is definitely moving in the list, or it's moving ahead of its
- // attached DPValues. Detach any existing DPValues.
+ // attached DbgRecords. Detach any existing DbgRecords.
handleMarkerRemoval();
}
}
@@ -229,15 +229,15 @@ void Instruction::moveBeforeImpl(BasicBlock &BB, InstListType::iterator I,
if (BB.IsNewDbgInfoFormat && !Preserve) {
DPMarker *NextMarker = getParent()->getNextMarker(this);
- // If we're inserting at point I, and not in front of the DPValues attached
- // there, then we should absorb the DPValues attached to I.
+ // If we're inserting at point I, and not in front of the DbgRecords
+ // attached there, then we should absorb the DbgRecords attached to I.
if (!InsertAtHead && NextMarker && !NextMarker->empty()) {
adoptDbgRecords(&BB, I, false);
}
}
if (isTerminator())
- getParent()->flushTerminatorDbgValues();
+ getParent()->flushTerminatorDbgRecords();
}
iterator_range<DbgRecord::self_iterator> Instruction::cloneDebugInfoFrom(
@@ -263,11 +263,11 @@ Instruction::getDbgReinsertionPosition() {
if (!NextMarker)
return std::nullopt;
- // Are there any DPValues in the next marker?
- if (NextMarker->StoredDPValues.empty())
+ // Are there any DbgRecords in the next marker?
+ if (NextMarker->StoredDbgRecords.empty())
return std::nullopt;
- return NextMarker->StoredDPValues.begin();
+ return NextMarker->StoredDbgRecords.begin();
}
bool Instruction::hasDbgRecords() const { return !getDbgRecordRange().empty(); }
@@ -275,20 +275,20 @@ bool Instruction::hasDbgRecords() const { return !getDbgRecordRange().empty(); }
void Instruction::adoptDbgRecords(BasicBlock *BB, BasicBlock::iterator It,
bool InsertAtHead) {
DPMarker *SrcMarker = BB->getMarker(It);
- auto ReleaseTrailingDPValues = [BB, It, SrcMarker]() {
+ auto ReleaseTrailingDbgRecords = [BB, It, SrcMarker]() {
if (BB->end() == It) {
SrcMarker->eraseFromParent();
BB->deleteTrailingDbgRecords();
}
};
- if (!SrcMarker || SrcMarker->StoredDPValues.empty()) {
- ReleaseTrailingDPValues();
+ if (!SrcMarker || SrcMarker->StoredDbgRecords.empty()) {
+ ReleaseTrailingDbgRecords();
return;
}
// If we have DPMarkers attached to this instruction, we have to honour the
- // ordering of DPValues between this and the other marker. Fall back to just
+ // ordering of DbgRecords between this and the other marker. Fall back to just
// absorbing from the source.
if (DbgMarker || It == BB->end()) {
// Ensure we _do_ have a marker.
@@ -304,10 +304,11 @@ void Instruction::adoptDbgRecords(BasicBlock *BB, BasicBlock::iterator It,
// block, it's important to not leave the empty marker trailing. It will
// give a misleading impression that some debug records have been left
// trailing.
- ReleaseTrailingDPValues();
+ ReleaseTrailingDbgRecords();
} else {
- // Optimisation: we're transferring all the DPValues from the source marker
- // onto this empty location: just adopt the other instructions marker.
+ // Optimisation: we're transferring all the DbgRecords from the source
+ // marker onto this empty location: just adopt the other instruction's
+ // marker.
DbgMarker = SrcMarker;
DbgMarker->MarkedInstr = this;
It->DbgMarker = nullptr;
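The else branch is the adoption fast path: instead of splicing record lists, the whole DPMarker changes owner. The three pointer updates, annotated:

    DbgMarker = SrcMarker;          // destination takes ownership of the marker
    DbgMarker->MarkedInstr = this;  // marker points back at its new instruction
    It->DbgMarker = nullptr;        // source instruction gives the marker up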
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index a0bf9ca..a471314 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -50,7 +50,7 @@ LLVMContextImpl::~LLVMContextImpl() {
// when its terminator was removed were eventually replaced. This assertion
// firing indicates that DPValues went missing during the lifetime of the
// LLVMContext.
- assert(TrailingDPValues.empty() && "DPValue records in blocks not cleaned");
+ assert(TrailingDbgRecords.empty() && "DbgRecords in blocks not cleaned");
#endif
// NOTE: We need to delete the contents of OwnedModules, but Module's dtor
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index c841b28..b1dcb26 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -1684,19 +1684,19 @@ public:
/// such a way. These are stored in LLVMContext because typically LLVM only
/// edits a small number of blocks at a time, so there's no need to bloat
/// BasicBlock with such a data structure.
- SmallDenseMap<BasicBlock *, DPMarker *> TrailingDPValues;
+ SmallDenseMap<BasicBlock *, DPMarker *> TrailingDbgRecords;
- // Set, get and delete operations for TrailingDPValues.
+ // Set, get and delete operations for TrailingDbgRecords.
void setTrailingDbgRecords(BasicBlock *B, DPMarker *M) {
- assert(!TrailingDPValues.count(B));
- TrailingDPValues[B] = M;
+ assert(!TrailingDbgRecords.count(B));
+ TrailingDbgRecords[B] = M;
}
DPMarker *getTrailingDbgRecords(BasicBlock *B) {
- return TrailingDPValues.lookup(B);
+ return TrailingDbgRecords.lookup(B);
}
- void deleteTrailingDbgRecords(BasicBlock *B) { TrailingDPValues.erase(B); }
+ void deleteTrailingDbgRecords(BasicBlock *B) { TrailingDbgRecords.erase(B); }
};
} // end namespace llvm
diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp
index 95d32e3..609ef09 100644
--- a/llvm/lib/MC/MCSectionXCOFF.cpp
+++ b/llvm/lib/MC/MCSectionXCOFF.cpp
@@ -87,8 +87,7 @@ void MCSectionXCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
if (getKind().isCommon() && !getKind().isBSSLocal())
return;
- assert((getKind().isBSSExtern() || getKind().isBSSLocal()) &&
- "Unexepected section kind for toc-data");
+ assert(getKind().isBSS() && "Unexpected section kind for toc-data");
printCsectDirective(OS);
return;
}
diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp
index 9000e9a..6139d99 100644
--- a/llvm/lib/Object/Archive.cpp
+++ b/llvm/lib/Object/Archive.cpp
@@ -969,12 +969,19 @@ Archive::Archive(MemoryBufferRef Source, Error &Err)
Err = Error::success();
}
+object::Archive::Kind Archive::getDefaultKindForTriple(Triple &T) {
+ if (T.isOSDarwin())
+ return object::Archive::K_DARWIN;
+ if (T.isOSAIX())
+ return object::Archive::K_AIXBIG;
+ if (T.isOSWindows())
+ return object::Archive::K_COFF;
+ return object::Archive::K_GNU;
+}
+
object::Archive::Kind Archive::getDefaultKind() {
Triple HostTriple(sys::getDefaultTargetTriple());
- return HostTriple.isOSDarwin()
- ? object::Archive::K_DARWIN
- : (HostTriple.isOSAIX() ? object::Archive::K_AIXBIG
- : object::Archive::K_GNU);
+ return getDefaultKindForTriple(HostTriple);
}
Archive::child_iterator Archive::child_begin(Error &Err,
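A sketch of calling the new hook; the triple is an arbitrary example, and the usual includes are assumed (note the parameter is a non-const Triple &, so an lvalue is required):

    Triple T("x86_64-pc-windows-msvc");
    object::Archive::Kind K = object::Archive::getDefaultKindForTriple(T);
    // K == K_COFF here; Darwin triples map to K_DARWIN, AIX to K_AIXBIG,
    // and everything else to K_GNU, per the mapping above.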
diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp
index e062974..aa57e55 100644
--- a/llvm/lib/Object/ArchiveWriter.cpp
+++ b/llvm/lib/Object/ArchiveWriter.cpp
@@ -62,12 +62,16 @@ object::Archive::Kind NewArchiveMember::detectKindFromObject() const {
Expected<std::unique_ptr<object::ObjectFile>> OptionalObject =
object::ObjectFile::createObjectFile(MemBufferRef);
- if (OptionalObject)
- return isa<object::MachOObjectFile>(**OptionalObject)
- ? object::Archive::K_DARWIN
- : (isa<object::XCOFFObjectFile>(**OptionalObject)
- ? object::Archive::K_AIXBIG
- : object::Archive::K_GNU);
+ if (OptionalObject) {
+ if (isa<object::MachOObjectFile>(**OptionalObject))
+ return object::Archive::K_DARWIN;
+ if (isa<object::XCOFFObjectFile>(**OptionalObject))
+ return object::Archive::K_AIXBIG;
+ if (isa<object::COFFObjectFile>(**OptionalObject) ||
+ isa<object::COFFImportFile>(**OptionalObject))
+ return object::Archive::K_COFF;
+ return object::Archive::K_GNU;
+ }
// Squelch the error in case we had a non-object file.
consumeError(OptionalObject.takeError());
@@ -80,10 +84,7 @@ object::Archive::Kind NewArchiveMember::detectKindFromObject() const {
MemBufferRef, file_magic::bitcode, &Context)) {
auto &IRObject = cast<object::IRObjectFile>(**ObjOrErr);
auto TargetTriple = Triple(IRObject.getTargetTriple());
- return TargetTriple.isOSDarwin()
- ? object::Archive::K_DARWIN
- : (TargetTriple.isOSAIX() ? object::Archive::K_AIXBIG
- : object::Archive::K_GNU);
+ return object::Archive::getDefaultKindForTriple(TargetTriple);
} else {
// Squelch the error in case this was not a SymbolicFile.
consumeError(ObjOrErr.takeError());
@@ -976,10 +977,12 @@ static Error writeArchiveToStream(raw_ostream &Out,
SmallString<0> StringTableBuf;
raw_svector_ostream StringTable(StringTableBuf);
SymMap SymMap;
+ bool ShouldWriteSymtab = WriteSymtab != SymtabWritingMode::NoSymtab;
// COFF symbol map uses 16-bit indexes, so we can't use it if there are too
- // many members.
- if (isCOFFArchive(Kind) && NewMembers.size() > 0xfffe)
+ // many members. The COFF format also requires a symbol table, so use
+ // GNU format when NoSymtab is requested.
+ if (isCOFFArchive(Kind) && (NewMembers.size() > 0xfffe || !ShouldWriteSymtab))
Kind = object::Archive::K_GNU;
// In the scenario when LLVMContext is populated SymbolicFile will contain a
@@ -1008,7 +1011,6 @@ static Error writeArchiveToStream(raw_ostream &Out,
uint64_t LastMemberHeaderOffset = 0;
uint64_t NumSyms = 0;
uint64_t NumSyms32 = 0; // Store symbol number of 32-bit member files.
- bool ShouldWriteSymtab = WriteSymtab != SymtabWritingMode::NoSymtab;
for (const auto &M : Data) {
// Record the start of the member's offset
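The widened fallback condition reads as a single predicate; restated as a helper (CanUseCoffFormat is a name invented for illustration):

    // The COFF symbol map stores 16-bit member indexes, and a COFF archive
    // cannot omit its symbol table, so either condition forces GNU format.
    bool CanUseCoffFormat(size_t NumMembers, bool ShouldWriteSymtab) {
      return ShouldWriteSymtab && NumMembers <= 0xfffe;
    }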
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 054311d..9665ae5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7751,7 +7751,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
// register allocator to pass call args in callee saved regs, without extra
// copies to avoid these fake clobbers of actually-preserved GPRs.
if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
- MI.getOpcode() == AArch64::MSRpstatePseudo)
+ MI.getOpcode() == AArch64::MSRpstatePseudo) {
for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
if (MachineOperand &MO = MI.getOperand(I);
MO.isReg() && MO.isImplicit() && MO.isDef() &&
@@ -7759,6 +7759,16 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
AArch64::GPR64RegClass.contains(MO.getReg())))
MI.removeOperand(I);
+ // The SVE vector length can change when entering/leaving streaming mode.
+ if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
+ MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
+ MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
+ /*IsImplicit=*/true));
+ MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
+ /*IsImplicit=*/true));
+ }
+ }
+
// Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
// have nothing to do with VG, were it not that they are used to materialise a
// frame-address. If they contain a frame-index to a scalable vector, this
@@ -21413,12 +21423,8 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
}
}
- // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
- // Only implemented on little-endian subtargets.
- bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
-
- // This optimization only works on little endian.
- if (!IsLittleEndian)
+ // These optimizations only work on little endian.
+ if (!DAG.getDataLayout().isLittleEndian())
return SDValue();
// uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
@@ -21437,21 +21443,28 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
return SDValue();
- auto getSourceOp = [](SDValue Operand) -> SDValue {
- const unsigned Opcode = Operand.getOpcode();
- if (Opcode == ISD::TRUNCATE)
- return Operand->getOperand(0);
- if (Opcode == ISD::BITCAST &&
- Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
- return Operand->getOperand(0)->getOperand(0);
- return SDValue();
- };
+ SDValue SourceOp0 = peekThroughBitcasts(Op0);
+ SDValue SourceOp1 = peekThroughBitcasts(Op1);
- SDValue SourceOp0 = getSourceOp(Op0);
- SDValue SourceOp1 = getSourceOp(Op1);
+ // truncating uzp1(x, y) -> xtn(concat (x, y))
+ if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
+ EVT Op0Ty = SourceOp0.getValueType();
+ if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
+ (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
+ SDValue Concat =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL,
+ Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
+ SourceOp0, SourceOp1);
+ return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
+ }
+ }
- if (!SourceOp0 || !SourceOp1)
+ // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
+ if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
+ SourceOp1.getOpcode() != ISD::TRUNCATE)
return SDValue();
+ SourceOp0 = SourceOp0.getOperand(0);
+ SourceOp1 = SourceOp1.getOperand(0);
if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
!SourceOp0.getValueType().isSimple())
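The new truncating-uzp1 rewrite rests on a lane-layout fact: on little endian, the even half-width lanes of concat(x, y) are exactly the low halves of its wide lanes. A scalar model of the v2i32 -> v4i16 case, independent of LLVM:

    #include <array>
    #include <cstdint>

    // uzp1 keeping the low 16 bits of each 32-bit lane of concat(x, y) is
    // the same as truncating concat(x, y) lane by lane.
    std::array<uint16_t, 4> uzp1Trunc(std::array<uint32_t, 2> X,
                                      std::array<uint32_t, 2> Y) {
      std::array<uint32_t, 4> Concat{X[0], X[1], Y[0], Y[1]};
      std::array<uint16_t, 4> R;
      for (int I = 0; I < 4; ++I)
        R[I] = static_cast<uint16_t>(Concat[I]); // truncate each lane
      return R;
    }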
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6254e68..b4b975c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6153,26 +6153,39 @@ defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;
-def : Pat<(v16i8 (concat_vectors (v8i8 (trunc (v8i16 V128:$Vn))),
- (v8i8 (trunc (v8i16 V128:$Vm))))),
- (UZP1v16i8 V128:$Vn, V128:$Vm)>;
-def : Pat<(v8i16 (concat_vectors (v4i16 (trunc (v4i32 V128:$Vn))),
- (v4i16 (trunc (v4i32 V128:$Vm))))),
- (UZP1v8i16 V128:$Vn, V128:$Vm)>;
-def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))),
- (v2i32 (trunc (v2i64 V128:$Vm))))),
- (UZP1v4i32 V128:$Vn, V128:$Vm)>;
-// These are the same as above, with an optional assertzext node that can be
-// generated from fptoi lowering.
-def : Pat<(v16i8 (concat_vectors (v8i8 (assertzext (trunc (v8i16 V128:$Vn)))),
- (v8i8 (assertzext (trunc (v8i16 V128:$Vm)))))),
- (UZP1v16i8 V128:$Vn, V128:$Vm)>;
-def : Pat<(v8i16 (concat_vectors (v4i16 (assertzext (trunc (v4i32 V128:$Vn)))),
- (v4i16 (assertzext (trunc (v4i32 V128:$Vm)))))),
- (UZP1v8i16 V128:$Vn, V128:$Vm)>;
-def : Pat<(v4i32 (concat_vectors (v2i32 (assertzext (trunc (v2i64 V128:$Vn)))),
- (v2i32 (assertzext (trunc (v2i64 V128:$Vm)))))),
- (UZP1v4i32 V128:$Vn, V128:$Vm)>;
+def trunc_optional_assert_ext : PatFrags<(ops node:$op0),
+ [(trunc node:$op0),
+ (assertzext (trunc node:$op0)),
+ (assertsext (trunc node:$op0))]>;
+
+// concat_vectors(trunc(x), trunc(y)) -> uzp1(x, y)
+// concat_vectors(assertzext(trunc(x)), assertzext(trunc(y))) -> uzp1(x, y)
+// concat_vectors(assertsext(trunc(x)), assertsext(trunc(y))) -> uzp1(x, y)
+class concat_trunc_to_uzp1_pat<ValueType SrcTy, ValueType TruncTy, ValueType ConcatTy>
+ : Pat<(ConcatTy (concat_vectors (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vn))),
+ (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vm))))),
+ (!cast<Instruction>("UZP1"#ConcatTy) V128:$Vn, V128:$Vm)>;
+def : concat_trunc_to_uzp1_pat<v8i16, v8i8, v16i8>;
+def : concat_trunc_to_uzp1_pat<v4i32, v4i16, v8i16>;
+def : concat_trunc_to_uzp1_pat<v2i64, v2i32, v4i32>;
+
+// trunc(concat_vectors(trunc(x), trunc(y))) -> xtn(uzp1(x, y))
+// trunc(concat_vectors(assertzext(trunc(x)), assertzext(trunc(y)))) -> xtn(uzp1(x, y))
+// trunc(concat_vectors(assertsext(trunc(x)), assertsext(trunc(y)))) -> xtn(uzp1(x, y))
+class trunc_concat_trunc_to_xtn_uzp1_pat<ValueType SrcTy, ValueType TruncTy, ValueType ConcatTy,
+ ValueType Ty>
+ : Pat<(Ty (trunc_optional_assert_ext
+ (ConcatTy (concat_vectors
+ (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vn))),
+ (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vm))))))),
+ (!cast<Instruction>("XTN"#Ty) (!cast<Instruction>("UZP1"#ConcatTy) V128:$Vn, V128:$Vm))>;
+def : trunc_concat_trunc_to_xtn_uzp1_pat<v4i32, v4i16, v8i16, v8i8>;
+def : trunc_concat_trunc_to_xtn_uzp1_pat<v2i64, v2i32, v4i32, v4i16>;
+
+def : Pat<(v8i8 (trunc (concat_vectors (v4i16 V64:$Vn), (v4i16 V64:$Vm)))),
+ (UZP1v8i8 V64:$Vn, V64:$Vm)>;
+def : Pat<(v4i16 (trunc (concat_vectors (v2i32 V64:$Vn), (v2i32 V64:$Vm)))),
+ (UZP1v4i16 V64:$Vn, V64:$Vm)>;
def : Pat<(v16i8 (concat_vectors
(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))),
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 33cb5f9..44d9a8a 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -223,8 +223,6 @@ def MSRpstatesvcrImm1
let Inst{8} = imm;
let Inst{7-5} = 0b011; // op2
let hasPostISelHook = 1;
- let Uses = [VG];
- let Defs = [VG];
}
def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 5ed82c0..86f77f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -194,7 +194,25 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
}];
}
-class is_canonicalized<SDPatternOperator op> : PatFrag<
+class is_canonicalized_1<SDPatternOperator op> : PatFrag<
+ (ops node:$src0),
+ (op $src0),
+ [{
+ const SITargetLowering &Lowering =
+ *static_cast<const SITargetLowering *>(getTargetLowering());
+
+ return Lowering.isCanonicalized(*CurDAG, N->getOperand(0));
+ }]> {
+
+ let GISelPredicateCode = [{
+ const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+ MF.getSubtarget().getTargetLowering());
+
+ return TLI->isCanonicalized(MI.getOperand(1).getReg(), MF);
+ }];
+}
+
+class is_canonicalized_2<SDPatternOperator op> : PatFrag<
(ops node:$src0, node:$src1),
(op $src0, $src1),
[{
@@ -210,8 +228,8 @@ class is_canonicalized<SDPatternOperator op> : PatFrag<
const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
MF.getSubtarget().getTargetLowering());
- return TLI->isCanonicalized(MI.getOperand(1).getReg(), const_cast<MachineFunction&>(MF)) &&
- TLI->isCanonicalized(MI.getOperand(2).getReg(), const_cast<MachineFunction&>(MF));
+ return TLI->isCanonicalized(MI.getOperand(1).getReg(), MF) &&
+ TLI->isCanonicalized(MI.getOperand(2).getReg(), MF);
}];
}
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index c709102..4ae514f 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -2616,7 +2616,6 @@ defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12<0x042, "buffer
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_gfx11<0x050, "buffer_atomic_cmpswap_f32">;
defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Real_Atomic_gfx12<0x050>;
defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomic_gfx11_gfx12<0x037, "buffer_atomic_sub_clamp_u32", "buffer_atomic_csub_u32">;
-def : Mnem_gfx11_gfx12<"buffer_atomic_csub", "buffer_atomic_csub_u32">;
defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_gfx11_gfx12<0x040, "buffer_atomic_dec_u32">;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_gfx11_gfx12<0x04D, "buffer_atomic_dec_u64">;
defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_gfx11_gfx12<0x03F, "buffer_atomic_inc_u32">;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 87ace01..e944dde 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1735,14 +1735,12 @@ def DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>;
def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>;
def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>;
-let SubtargetPredicate = isGFX90APlus in {
- def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>;
- def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
-} // End SubtargetPredicate = isGFX90APlus
-
-let SubtargetPredicate = isGFX940Plus in {
- def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
- def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
- def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
- def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
-} // End SubtargetPredicate = isGFX940Plus
+// GFX90A+.
+def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>;
+def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
+
+// GFX940+.
+def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
+def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
+def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
+def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9bc1b8e..5ccf21f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12572,6 +12572,10 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::FREM:
case ISD::FP_ROUND:
case ISD::FP_EXTEND:
+ case ISD::FP16_TO_FP:
+ case ISD::FP_TO_FP16:
+ case ISD::BF16_TO_FP:
+ case ISD::FP_TO_BF16:
case ISD::FLDEXP:
case AMDGPUISD::FMUL_LEGACY:
case AMDGPUISD::FMAD_FTZ:
@@ -12591,6 +12595,9 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case AMDGPUISD::CVT_F32_UBYTE1:
case AMDGPUISD::CVT_F32_UBYTE2:
case AMDGPUISD::CVT_F32_UBYTE3:
+ case AMDGPUISD::FP_TO_FP16:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::COS_HW:
return true;
// It can/will be lowered or combined as a bit operation.
@@ -12600,6 +12607,20 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::FCOPYSIGN:
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ case ISD::AND:
+ if (Op.getValueType() == MVT::i32) {
+ // Be careful as we only know it is a bitcast floating point type. It
+ // could be f32 or v2f16; we have no way of knowing. Luckily the constant
+ // value that we optimize for, which comes up in fp32 to bf16 conversions,
+ // is valid to optimize for all types.
+ if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (RHS->getZExtValue() == 0xffff0000) {
+ return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ }
+ }
+ }
+ break;
+
case ISD::FSIN:
case ISD::FCOS:
case ISD::FSINCOS:
@@ -12665,6 +12686,9 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
return false;
case ISD::BITCAST:
+ // TODO: This is incorrect as it loses track of the operand's type. We may
+ // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
+ // same bits that are canonicalized in one type need not be in the other.
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
case ISD::TRUNCATE: {
// Hack round the mess we make when legalizing extract_vector_elt
@@ -12694,25 +12718,26 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case Intrinsic::amdgcn_trig_preop:
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
+ case Intrinsic::amdgcn_sqrt:
return true;
default:
break;
}
- [[fallthrough]];
+ break;
}
default:
- // FIXME: denormalsEnabledForType is broken for dynamic
- return denormalsEnabledForType(DAG, Op.getValueType()) &&
- DAG.isKnownNeverSNaN(Op);
+ break;
}
- llvm_unreachable("invalid operation");
+ // FIXME: denormalsEnabledForType is broken for dynamic
+ return denormalsEnabledForType(DAG, Op.getValueType()) &&
+ DAG.isKnownNeverSNaN(Op);
}
-bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
+bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
unsigned MaxDepth) const {
- MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
MachineInstr *MI = MRI.getVRegDef(Reg);
unsigned Opcode = MI->getOpcode();
@@ -12931,27 +12956,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
}
}
- unsigned SrcOpc = N0.getOpcode();
-
- // If it's free to do so, push canonicalizes further up the source, which may
- // find a canonical source.
- //
- // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
- // sNaNs.
- if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
- auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
- if (CRHS && N0.hasOneUse()) {
- SDLoc SL(N);
- SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
- N0.getOperand(0));
- SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
- DCI.AddToWorklist(Canon0.getNode());
-
- return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
- }
- }
-
- return isCanonicalized(DAG, N0) ? N0 : SDValue();
+ return SDValue();
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
@@ -15939,8 +15944,8 @@ bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
}
}
-bool SITargetLowering::denormalsEnabledForType(LLT Ty,
- MachineFunction &MF) const {
+bool SITargetLowering::denormalsEnabledForType(
+ LLT Ty, const MachineFunction &MF) const {
switch (Ty.getScalarSizeInBits()) {
case 32:
return !denormalModeIsFlushAllF32(MF);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index a20442e..89da442 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -523,10 +523,10 @@ public:
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
- bool isCanonicalized(Register Reg, MachineFunction &MF,
+ bool isCanonicalized(Register Reg, const MachineFunction &MF,
unsigned MaxDepth = 5) const;
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const;
- bool denormalsEnabledForType(LLT Ty, MachineFunction &MF) const;
+ bool denormalsEnabledForType(LLT Ty, const MachineFunction &MF) const;
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
const TargetRegisterInfo *TRI,
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 33c93cd..3ab7884 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2944,6 +2944,34 @@ def : GCNPat<
(V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0,
(V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>;
+// If fcanonicalize's operand is implicitly canonicalized, we only need a copy.
+let AddedComplexity = 1000 in {
+def : GCNPat<
+ (is_canonicalized_1<fcanonicalize> f16:$src),
+ (COPY f16:$src)
+>;
+
+def : GCNPat<
+ (is_canonicalized_1<fcanonicalize> v2f16:$src),
+ (COPY v2f16:$src)
+>;
+
+def : GCNPat<
+ (is_canonicalized_1<fcanonicalize> f32:$src),
+ (COPY f32:$src)
+>;
+
+def : GCNPat<
+ (is_canonicalized_1<fcanonicalize> v2f32:$src),
+ (COPY v2f32:$src)
+>;
+
+def : GCNPat<
+ (is_canonicalized_1<fcanonicalize> f64:$src),
+ (COPY f64:$src)
+>;
+}
+
// Prefer selecting to max when legal, but using mul is always valid.
let AddedComplexity = -5 in {
@@ -3277,8 +3305,8 @@ def : GCNPat <
let AddedComplexity = 5 in {
def : GCNPat <
- (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
- (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
+ (v2f16 (is_canonicalized_2<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
+ (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
(V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
>;
}
@@ -3590,6 +3618,17 @@ FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
DSTCLAMP.NONE, DSTOMOD.NONE)
>;
+class
+FPMinCanonMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
+ SDPatternOperator max_or_min_oneuse> : GCNPat <
+ (min_or_max (is_canonicalized_1<fcanonicalize>
+ (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods),
+ (VOP3Mods vt:$src1, i32:$src1_mods))),
+ (vt (VOP3Mods vt:$src2, i32:$src2_mods))),
+ (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
+ DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
let OtherPredicates = [isGFX11Plus] in {
def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>;
def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>;
@@ -3599,6 +3638,10 @@ def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
}
let OtherPredicates = [isGFX9Plus] in {
@@ -3612,6 +3655,10 @@ def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fmi
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
}
// Convert a floating-point power of 2 to the integer exponent.
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
index 3fd7a1a..98cd3a8 100644
--- a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
+++ b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
@@ -695,8 +695,8 @@ bool PPCInstructionSelector::selectConstantPool(
.addReg(HaAddrReg)
.addMemOperand(MMO);
else
- // For medium code model, generate ADDItocL(CPI, ADDIStocHA8(X2, CPI))
- MI = BuildMI(MBB, I, DbgLoc, TII.get(PPC::ADDItocL), DstReg)
+ // For medium code model, generate ADDItocL8(CPI, ADDIStocHA8(X2, CPI))
+ MI = BuildMI(MBB, I, DbgLoc, TII.get(PPC::ADDItocL8), DstReg)
.addReg(HaAddrReg)
.addConstantPoolIndex(CPI);
}
diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td
index 3bbc5a6..5015ba8 100644
--- a/llvm/lib/Target/PowerPC/P10InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P10InstrResources.td
@@ -881,7 +881,7 @@ def : InstRW<[P10W_FX_3C, P10W_DISP_ANY],
// 3 Cycles ALU operations, 1 input operand
def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read],
(instrs
- ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL, LI, LI8,
+ ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL8, LI, LI8,
ADDIC, ADDIC8,
ADDIS, ADDIS8, ADDISdtprelHA32, ADDIStocHA, ADDIStocHA8, LIS, LIS8,
ADDME, ADDME8,
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 9396ca2..542854e 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1236,8 +1236,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
- case PPC::ADDItocL: {
- // Transform %xd = ADDItocL %xs, @sym
+ case PPC::ADDItocL8: {
+ // Transform %xd = ADDItocL8 %xs, @sym
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
// Change the opcode to ADDI8. If the global address is external, then
@@ -1246,7 +1246,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
TmpInst.setOpcode(PPC::ADDI8);
const MachineOperand &MO = MI->getOperand(2);
- assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL.");
+ assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL8.");
LLVM_DEBUG(assert(
!(MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal())) &&
@@ -2659,6 +2659,8 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
// If the Global Variable has the toc-data attribute, it needs to be emitted
// when we emit the .toc section.
if (GV->hasAttribute("toc-data")) {
+ unsigned PointerSize = GV->getParent()->getDataLayout().getPointerSize();
+ Subtarget->tocDataChecks(PointerSize, GV);
TOCDataGlobalVars.push_back(GV);
return;
}
diff --git a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
index 8bbe315..6bb66bc 100644
--- a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
+++ b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
@@ -29,7 +29,7 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1,
ADDIStocHA8,
ADDIdtprelL32,
ADDItlsldLADDR32,
- ADDItocL,
+ ADDItocL8,
ADDME,
ADDME8,
ADDME8O,
@@ -518,7 +518,7 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1,
ADDIStocHA8,
ADDIdtprelL32,
ADDItlsldLADDR32,
- ADDItocL,
+ ADDItocL8,
ADDME,
ADDME8,
ADDME8O,
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index 56af80f..6e31cda 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -2094,7 +2094,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
// for large code model, we generate:
// LDtocL(GV, ADDIStocHA8(%x2, GV))
// Otherwise we generate:
- // ADDItocL(ADDIStocHA8(%x2, GV), GV)
+ // ADDItocL8(ADDIStocHA8(%x2, GV), GV)
// Either way, start with the ADDIStocHA8:
Register HighPartReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::ADDIStocHA8),
@@ -2104,9 +2104,11 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::LDtocL),
DestReg).addGlobalAddress(GV).addReg(HighPartReg);
} else {
- // Otherwise generate the ADDItocL.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::ADDItocL),
- DestReg).addReg(HighPartReg).addGlobalAddress(GV);
+ // Otherwise generate the ADDItocL8.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::ADDItocL8),
+ DestReg)
+ .addReg(HighPartReg)
+ .addGlobalAddress(GV);
}
}
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 9e5f0b3..0c25accd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -521,40 +521,6 @@ static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) {
if (!GV->hasAttribute("toc-data"))
return false;
-
- // TODO: These asserts should be updated as more support for the toc data
- // transformation is added (struct support, etc.).
-
- assert(
- PointerSize >= GV->getAlign().valueOrOne().value() &&
- "GlobalVariables with an alignment requirement stricter than TOC entry "
- "size not supported by the toc data transformation.");
-
- Type *GVType = GV->getValueType();
-
- assert(GVType->isSized() && "A GlobalVariable's size must be known to be "
- "supported by the toc data transformation.");
-
- if (GVType->isVectorTy())
- report_fatal_error("A GlobalVariable of Vector type is not currently "
- "supported by the toc data transformation.");
-
- if (GVType->isArrayTy())
- report_fatal_error("A GlobalVariable of Array type is not currently "
- "supported by the toc data transformation.");
-
- if (GVType->isStructTy())
- report_fatal_error("A GlobalVariable of Struct type is not currently "
- "supported by the toc data transformation.");
-
- assert(GVType->getPrimitiveSizeInBits() <= PointerSize * 8 &&
- "A GlobalVariable with size larger than a TOC entry is not currently "
- "supported by the toc data transformation.");
-
- if (GV->hasPrivateLinkage())
- report_fatal_error("A GlobalVariable with private linkage is not "
- "currently supported by the toc data transformation.");
-
return true;
}
@@ -6168,7 +6134,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// [64-bit ELF/AIX]
// LDtocL(@sym, ADDIStocHA8(%x2, @sym))
// Otherwise we generate:
- // ADDItocL(ADDIStocHA8(%x2, @sym), @sym)
+ // ADDItocL8(ADDIStocHA8(%x2, @sym), @sym)
SDValue GA = N->getOperand(0);
SDValue TOCbase = N->getOperand(1);
@@ -6188,7 +6154,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
// Build the address relative to the TOC-pointer.
- ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL8, dl, MVT::i64,
SDValue(Tmp, 0), GA));
return;
}
@@ -7741,7 +7707,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
// target flags on the immediate operand when we fold it into the
// load instruction.
//
- // For something like ADDItocL, the relocation information is
+ // For something like ADDItocL8, the relocation information is
// inferred from the opcode; when we process it in the AsmPrinter,
// we add the necessary relocation there. A load, though, can receive
// relocation from various flavors of ADDIxxx, so we need to carry
@@ -7762,7 +7728,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
case PPC::ADDItlsldL:
Flags = PPCII::MO_TLSLD_LO;
break;
- case PPC::ADDItocL:
+ case PPC::ADDItocL8:
Flags = PPCII::MO_TOC_LO;
break;
}
@@ -7789,7 +7755,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
// If we have an addi(toc@l)/addis(toc@ha) pair, and the addis has only
// one use, then we can do this for any offset, we just need to also
// update the offset (i.e. the symbol addend) on the addis also.
- if (Base.getMachineOpcode() != PPC::ADDItocL)
+ if (Base.getMachineOpcode() != PPC::ADDItocL8)
continue;
if (!HBase.isMachineOpcode() ||
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 2949d58..a935979 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1480,8 +1480,8 @@ let hasSideEffects = 0 in {
let isReMaterializable = 1 in {
def ADDIStocHA8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
"#ADDIStocHA8", []>, isPPC64;
-def ADDItocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
- "#ADDItocL", []>, isPPC64;
+def ADDItocL8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+ "#ADDItocL8", []>, isPPC64;
}
// Local Data Transform
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 5d37e92..5f5eb31 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1077,7 +1077,7 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(
case PPC::LIS8:
case PPC::ADDIStocHA:
case PPC::ADDIStocHA8:
- case PPC::ADDItocL:
+ case PPC::ADDItocL8:
case PPC::LOAD_STACK_GUARD:
case PPC::PPCLdFixedAddr:
case PPC::XXLXORz:
@@ -3453,7 +3453,7 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
break;
case PPC::LI:
case PPC::LI8:
- case PPC::ADDItocL:
+ case PPC::ADDItocL8:
case PPC::ADDI:
case PPC::ADDI8:
OpNoForForwarding = i;
@@ -4420,7 +4420,7 @@ bool PPCInstrInfo::isDefMIElgibleForForwarding(MachineInstr &DefMI,
MachineOperand *&ImmMO,
MachineOperand *&RegMO) const {
unsigned Opc = DefMI.getOpcode();
- if (Opc != PPC::ADDItocL && Opc != PPC::ADDI && Opc != PPC::ADDI8)
+ if (Opc != PPC::ADDItocL8 && Opc != PPC::ADDI && Opc != PPC::ADDI8)
return false;
assert(DefMI.getNumOperands() >= 3 &&
@@ -4485,8 +4485,8 @@ bool PPCInstrInfo::isImmElgibleForForwarding(const MachineOperand &ImmMO,
int64_t &Imm,
int64_t BaseImm) const {
assert(isAnImmediateOperand(ImmMO) && "ImmMO is NOT an immediate");
- if (DefMI.getOpcode() == PPC::ADDItocL) {
- // The operand for ADDItocL is CPI, which isn't imm at compiling time,
+ if (DefMI.getOpcode() == PPC::ADDItocL8) {
+ // The operand for ADDItocL8 is CPI, which isn't an immediate at compile
// time. However, we know that it is 16 bits wide and has an alignment of 4.
// Check if the instruction meets the requirement.
if (III.ImmMustBeMultipleOf > 4 ||
@@ -4899,7 +4899,7 @@ bool PPCInstrInfo::transformToImmFormFedByAdd(
// register with ImmMO.
// Before that, we need to fixup the target flags for imm.
// For some reason, we fail to set the flag for the ImmMO if it is CPI.
- if (DefMI.getOpcode() == PPC::ADDItocL)
+ if (DefMI.getOpcode() == PPC::ADDItocL8)
ImmMO->setTargetFlags(PPCII::MO_TOC_LO);
// MI didn't have the interface such as MI.setOperand(i) though
diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
index 6b8ad22..fb6e656 100644
--- a/llvm/lib/Target/PowerPC/PPCMacroFusion.def
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
@@ -32,7 +32,7 @@
// {addi} followed by one of these {lxvd2x, lxvw4x, lxvdsx, lvebx, lvehx,
// lvewx, lvx, lxsdx}
FUSION_FEATURE(AddiLoad, hasAddiLoadFusion, 2, \
- FUSION_OP_SET(ADDI, ADDI8, ADDItocL), \
+ FUSION_OP_SET(ADDI, ADDI8, ADDItocL8), \
FUSION_OP_SET(LXVD2X, LXVW4X, LXVDSX, LVEBX, LVEHX, LVEWX, \
LVX, LXSDX))
@@ -135,11 +135,11 @@ FUSION_FEATURE(XorisXori, hasWideImmFusion, 1, FUSION_OP_SET(XORIS, XORIS8),
// addis rx,ra,si - addi rt,rx,SI, SI >= 0
FUSION_FEATURE(AddisAddi, hasWideImmFusion, 1,
FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8),
- FUSION_OP_SET(ADDI, ADDI8, ADDItocL))
+ FUSION_OP_SET(ADDI, ADDI8, ADDItocL8))
// addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2
FUSION_FEATURE(AddiAddis, hasWideImmFusion, 1,
- FUSION_OP_SET(ADDI, ADDI8, ADDItocL),
+ FUSION_OP_SET(ADDI, ADDI8, ADDItocL8),
FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8))
// mtctr - { bcctr,bcctrl }
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 5380ec1..884f2f5 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -185,6 +185,28 @@ bool PPCSubtarget::enableSubRegLiveness() const {
return UseSubRegLiveness;
}
+void PPCSubtarget::tocDataChecks(unsigned PointerSize,
+ const GlobalVariable *GV) const {
+ // TODO: These asserts should be updated as more support for the toc data
+ // transformation is added (struct support, etc.).
+ assert(
+ PointerSize >= GV->getAlign().valueOrOne().value() &&
+ "GlobalVariables with an alignment requirement stricter than TOC entry "
+ "size not supported by the toc data transformation.");
+
+ Type *GVType = GV->getValueType();
+ assert(GVType->isSized() && "A GlobalVariable's size must be known to be "
+ "supported by the toc data transformation.");
+ if (GV->getParent()->getDataLayout().getTypeSizeInBits(GVType) >
+ PointerSize * 8)
+ report_fatal_error(
+ "A GlobalVariable with size larger than a TOC entry is not currently "
+ "supported by the toc data transformation.");
+ if (GV->hasPrivateLinkage())
+ report_fatal_error("A GlobalVariable with private linkage is not "
+ "currently supported by the toc data transformation.");
+}
+
bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
// Large code model always uses the TOC even for local symbols.
if (TM.getCodeModel() == CodeModel::Large)
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 306a52d..d913f22 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -245,6 +245,8 @@ public:
/// True if the GV will be accessed via an indirect symbol.
bool isGVIndirectSymbol(const GlobalValue *GV) const;
+ void tocDataChecks(unsigned PointerSize, const GlobalVariable *GV) const;
+
/// True if the ABI is descriptor based.
bool usesFunctionDescriptors() const {
// Both 32-bit and 64-bit AIX are descriptor based. For ELF only the 64-bit
diff --git a/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
index 81f078a..0527991 100644
--- a/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
@@ -94,8 +94,7 @@ namespace {
protected:
bool hasTOCLoReloc(const MachineInstr &MI) {
- if (MI.getOpcode() == PPC::LDtocL ||
- MI.getOpcode() == PPC::ADDItocL ||
+ if (MI.getOpcode() == PPC::LDtocL || MI.getOpcode() == PPC::ADDItocL8 ||
MI.getOpcode() == PPC::LWZtocL)
return true;
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index d83979a..2da75bd 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -2716,7 +2716,7 @@ ParseStatus RISCVAsmParser::parseDirective(AsmToken DirectiveID) {
bool RISCVAsmParser::resetToArch(StringRef Arch, SMLoc Loc, std::string &Result,
bool FromOptionDirective) {
- for (auto Feature : RISCVFeatureKV)
+ for (auto &Feature : RISCVFeatureKV)
if (llvm::RISCVISAInfo::isSupportedExtensionFeature(Feature.Key))
clearFeatureBits(Feature.Value, Feature.Key);
@@ -2735,7 +2735,7 @@ bool RISCVAsmParser::resetToArch(StringRef Arch, SMLoc Loc, std::string &Result,
}
auto &ISAInfo = *ParseResult;
- for (auto Feature : RISCVFeatureKV)
+ for (auto &Feature : RISCVFeatureKV)
if (ISAInfo->hasExtension(Feature.Key))
setFeatureBits(Feature.Value, Feature.Key);
@@ -2823,9 +2823,8 @@ bool RISCVAsmParser::parseDirectiveOption() {
break;
}
- ArrayRef<SubtargetFeatureKV> KVArray(RISCVFeatureKV);
- auto Ext = llvm::lower_bound(KVArray, Arch);
- if (Ext == KVArray.end() || StringRef(Ext->Key) != Arch ||
+ auto Ext = llvm::lower_bound(RISCVFeatureKV, Arch);
+ if (Ext == std::end(RISCVFeatureKV) || StringRef(Ext->Key) != Arch ||
!RISCVISAInfo::isSupportedExtension(Arch)) {
if (isDigit(Arch.back()))
return Error(
@@ -2858,7 +2857,7 @@ bool RISCVAsmParser::parseDirectiveOption() {
// It is invalid to disable an extension when other enabled extensions
// depend on it.
// TODO: Make use of RISCVISAInfo to handle this
- for (auto Feature : KVArray) {
+ for (auto &Feature : RISCVFeatureKV) {
if (getSTI().hasFeature(Feature.Value) &&
Feature.Implies.test(Ext->Value))
return Error(Loc,
@@ -3271,11 +3270,13 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(1))
.addOperand(Inst.getOperand(2))
- .addReg(RISCV::NoRegister));
+ .addReg(RISCV::NoRegister)
+ .setLoc(IDLoc));
emitToStreamer(Out, MCInstBuilder(RISCV::VMNAND_MM)
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(0))
- .addOperand(Inst.getOperand(0)));
+ .addOperand(Inst.getOperand(0))
+ .setLoc(IDLoc));
} else if (Inst.getNumOperands() == 4) {
// masked va >= x, vd != v0
//
@@ -3287,11 +3288,13 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(1))
.addOperand(Inst.getOperand(2))
- .addOperand(Inst.getOperand(3)));
+ .addOperand(Inst.getOperand(3))
+ .setLoc(IDLoc));
emitToStreamer(Out, MCInstBuilder(RISCV::VMXOR_MM)
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(0))
- .addReg(RISCV::V0));
+ .addReg(RISCV::V0)
+ .setLoc(IDLoc));
} else if (Inst.getNumOperands() == 5 &&
Inst.getOperand(0).getReg() == RISCV::V0) {
// masked va >= x, vd == v0
@@ -3306,11 +3309,13 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
.addOperand(Inst.getOperand(1))
.addOperand(Inst.getOperand(2))
.addOperand(Inst.getOperand(3))
- .addReg(RISCV::NoRegister));
+ .addReg(RISCV::NoRegister)
+ .setLoc(IDLoc));
emitToStreamer(Out, MCInstBuilder(RISCV::VMANDN_MM)
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(0))
- .addOperand(Inst.getOperand(1)));
+ .addOperand(Inst.getOperand(1))
+ .setLoc(IDLoc));
} else if (Inst.getNumOperands() == 5) {
// masked va >= x, any vd
//
@@ -3323,19 +3328,23 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
.addOperand(Inst.getOperand(1))
.addOperand(Inst.getOperand(2))
.addOperand(Inst.getOperand(3))
- .addReg(RISCV::NoRegister));
+ .addReg(RISCV::NoRegister)
+ .setLoc(IDLoc));
emitToStreamer(Out, MCInstBuilder(RISCV::VMANDN_MM)
.addOperand(Inst.getOperand(1))
.addReg(RISCV::V0)
- .addOperand(Inst.getOperand(1)));
+ .addOperand(Inst.getOperand(1))
+ .setLoc(IDLoc));
emitToStreamer(Out, MCInstBuilder(RISCV::VMANDN_MM)
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(0))
- .addReg(RISCV::V0));
+ .addReg(RISCV::V0)
+ .setLoc(IDLoc));
emitToStreamer(Out, MCInstBuilder(RISCV::VMOR_MM)
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(1))
- .addOperand(Inst.getOperand(0)));
+ .addOperand(Inst.getOperand(0))
+ .setLoc(IDLoc));
}
}
@@ -3637,7 +3646,8 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(1))
.addImm(Imm - 1)
- .addOperand(Inst.getOperand(3)));
+ .addOperand(Inst.getOperand(3))
+ .setLoc(IDLoc));
return false;
}
case RISCV::PseudoVMSGEU_VI:
@@ -3655,7 +3665,8 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(1))
.addOperand(Inst.getOperand(1))
- .addOperand(Inst.getOperand(3)));
+ .addOperand(Inst.getOperand(3))
+ .setLoc(IDLoc));
} else {
// Other immediate values can subtract one like signed.
unsigned Opc = Inst.getOpcode() == RISCV::PseudoVMSGEU_VI
@@ -3665,7 +3676,8 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(1))
.addImm(Imm - 1)
- .addOperand(Inst.getOperand(3)));
+ .addOperand(Inst.getOperand(3))
+ .setLoc(IDLoc));
}
return false;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 08678a8..803774f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10466,6 +10466,7 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
SDValue BasePtr = MemSD->getBasePtr();
SDValue Val, Mask, VL;
+ bool IsCompressingStore = false;
if (const auto *VPStore = dyn_cast<VPStoreSDNode>(Op)) {
Val = VPStore->getValue();
Mask = VPStore->getMask();
@@ -10474,9 +10475,11 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
const auto *MStore = cast<MaskedStoreSDNode>(Op);
Val = MStore->getValue();
Mask = MStore->getMask();
+ IsCompressingStore = MStore->isCompressingStore();
}
- bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+ bool IsUnmasked =
+ ISD::isConstantSplatVectorAllOnes(Mask.getNode()) || IsCompressingStore;
MVT VT = Val.getSimpleValueType();
MVT XLenVT = Subtarget.getXLenVT();
@@ -10486,7 +10489,7 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
ContainerVT = getContainerForFixedLengthVector(VT);
Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget);
- if (!IsUnmasked) {
+ if (!IsUnmasked || IsCompressingStore) {
MVT MaskVT = getMaskTypeFor(ContainerVT);
Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
}
@@ -10495,6 +10498,15 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
if (!VL)
VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+ if (IsCompressingStore) {
+ Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+ DAG.getConstant(Intrinsic::riscv_vcompress, DL, XLenVT),
+ DAG.getUNDEF(ContainerVT), Val, Mask, VL);
+ VL =
+ DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Mask,
+ getAllOnesMask(Mask.getSimpleValueType(), VL, DL, DAG), VL);
+ }
+
unsigned IntID =
IsUnmasked ? Intrinsic::riscv_vse : Intrinsic::riscv_vse_mask;
SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
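For reference, llvm.masked.compressstore packs the active elements contiguously at the base pointer, which is what the vcompress + VCPOP_VL pair above implements: compress the data, then perform a unit-stride store whose VL is the mask's population count. A scalar sketch of the intended semantics (illustrative only):

#include <cstddef>

// Reference semantics of a masked compressing store: active elements are
// written back-to-back starting at Ptr; nothing is stored for inactive
// lanes. The return value corresponds to the vcpop-derived VL above.
template <typename T>
size_t compressStoreRef(const T *Val, const bool *Mask, size_t N, T *Ptr) {
  size_t Out = 0;
  for (size_t I = 0; I < N; ++I)
    if (Mask[I])
      Ptr[Out++] = Val[I];
  return Out;
}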
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ecd3736..8f46fdc 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1620,3 +1620,13 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
C2.NumIVMuls, C2.NumBaseAdds,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
+
+bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
+ auto *VTy = dyn_cast<VectorType>(DataTy);
+ if (!VTy || VTy->isScalableTy())
+ return false;
+
+ if (!isLegalMaskedLoadStore(DataTy, Alignment))
+ return false;
+ return true;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index af36e9d..8daf684 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -261,6 +261,8 @@ public:
return TLI->isLegalStridedLoadStore(DataTypeVT, Alignment);
}
+ bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment);
+
bool isVScaleKnownToBeAPowerOfTwo() const {
return TLI->isVScaleKnownToBeAPowerOfTwo();
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 2d7a00b..f1fbe2b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -85,6 +85,42 @@ static ConstantInt *getConstInt(MDNode *MD, unsigned NumOp) {
return nullptr;
}
+// If the function has pointer arguments, we must re-create the function type
+// from scratch, replacing PointerType with TypedPointerType for each pointer
+// argument. Otherwise, the same `Type *` could correspond to different SPIR-V
+// function types, effectively invalidating the logic behind the global
+// registry and the duplicates tracker.
+static FunctionType *
+fixFunctionTypeIfPtrArgs(SPIRVGlobalRegistry *GR, const Function &F,
+ FunctionType *FTy, const SPIRVType *SRetTy,
+ const SmallVector<SPIRVType *, 4> &SArgTys) {
+ if (F.getParent()->getNamedMetadata("spv.cloned_funcs"))
+ return FTy;
+
+ bool hasArgPtrs = false;
+ for (auto &Arg : F.args()) {
+ // check if it's an instance of a non-typed PointerType
+ if (Arg.getType()->isPointerTy()) {
+ hasArgPtrs = true;
+ break;
+ }
+ }
+ if (!hasArgPtrs) {
+ Type *RetTy = FTy->getReturnType();
+ // check if it's an instance of a non-typed PointerType
+ if (!RetTy->isPointerTy())
+ return FTy;
+ }
+
+ // re-create function type, using TypedPointerType instead of PointerType to
+ // properly trace argument types
+ const Type *RetTy = GR->getTypeForSPIRVType(SRetTy);
+ SmallVector<Type *, 4> ArgTys;
+ for (auto SArgTy : SArgTys)
+ ArgTys.push_back(const_cast<Type *>(GR->getTypeForSPIRVType(SArgTy)));
+ return FunctionType::get(const_cast<Type *>(RetTy), ArgTys, false);
+}
+
// This code restores function args/retvalue types for composite cases
// because the final types should still be aggregate whereas they're i32
// during the translation to cope with aggregate flattening etc.
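A small illustration of why the rebuild above matters (the function below is hypothetical, not part of the patch): with opaque pointers, two functions with different pointee types share a single uniqued FunctionType, so only a TypedPointerType-based rebuild keeps their SPIR-V types apart.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/TypedPointerType.h"
#include <cassert>

using namespace llvm;

void opaquePointerCollision() {
  LLVMContext Ctx;
  Type *Void = Type::getVoidTy(Ctx);
  Type *Ptr = PointerType::get(Ctx, 0);
  // void(ptr) and void(ptr) are one uniqued type, whatever the pointees were.
  assert(FunctionType::get(Void, {Ptr}, false) ==
         FunctionType::get(Void, {Ptr}, false));
  // Rebuilt with typed pointers, the two signatures stay distinct.
  Type *PtrI32 = TypedPointerType::get(Type::getInt32Ty(Ctx), 0);
  Type *PtrF32 = TypedPointerType::get(Type::getFloatTy(Ctx), 0);
  assert(FunctionType::get(Void, {PtrI32}, false) !=
         FunctionType::get(Void, {PtrF32}, false));
}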
@@ -162,7 +198,7 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
// If OriginalArgType is non-pointer, use the OriginalArgType (the type cannot
// be legally reassigned later).
- if (!OriginalArgType->isPointerTy())
+ if (!isPointerTy(OriginalArgType))
return GR->getOrCreateSPIRVType(OriginalArgType, MIRBuilder, ArgAccessQual);
// In case OriginalArgType is of pointer type, there are three possibilities:
@@ -179,8 +215,7 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
SPIRVType *ElementType = GR->getOrCreateSPIRVType(ByValRefType, MIRBuilder);
return GR->getOrCreateSPIRVPointerType(
ElementType, MIRBuilder,
- addressSpaceToStorageClass(Arg->getType()->getPointerAddressSpace(),
- ST));
+ addressSpaceToStorageClass(getPointerAddressSpace(Arg->getType()), ST));
}
for (auto User : Arg->users()) {
@@ -240,7 +275,6 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
// Assign types and names to all args, and store their types for later.
- FunctionType *FTy = getOriginalFunctionType(F);
SmallVector<SPIRVType *, 4> ArgTypeVRegs;
if (VRegs.size() > 0) {
unsigned i = 0;
@@ -255,7 +289,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (Arg.hasName())
buildOpName(VRegs[i][0], Arg.getName(), MIRBuilder);
- if (Arg.getType()->isPointerTy()) {
+ if (isPointerTy(Arg.getType())) {
auto DerefBytes = static_cast<unsigned>(Arg.getDereferenceableBytes());
if (DerefBytes != 0)
buildOpDecorate(VRegs[i][0], MIRBuilder,
@@ -322,7 +356,9 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
MRI->setRegClass(FuncVReg, &SPIRV::IDRegClass);
if (F.isDeclaration())
GR->add(&F, &MIRBuilder.getMF(), FuncVReg);
+ FunctionType *FTy = getOriginalFunctionType(F);
SPIRVType *RetTy = GR->getOrCreateSPIRVType(FTy->getReturnType(), MIRBuilder);
+ FTy = fixFunctionTypeIfPtrArgs(GR, F, FTy, RetTy, ArgTypeVRegs);
SPIRVType *FuncTy = GR->getOrCreateOpTypeFunctionWithArgs(
FTy, RetTy, ArgTypeVRegs, MIRBuilder);
uint32_t FuncControl = getFunctionControl(F);
@@ -429,7 +465,6 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
MachineFunction &MF = MIRBuilder.getMF();
GR->setCurrentFunc(MF);
- FunctionType *FTy = nullptr;
const Function *CF = nullptr;
std::string DemangledName;
const Type *OrigRetTy = Info.OrigRet.Ty;
@@ -444,7 +479,7 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// TODO: support constexpr casts and indirect calls.
if (CF == nullptr)
return false;
- if ((FTy = getOriginalFunctionType(*CF)) != nullptr)
+ if (FunctionType *FTy = getOriginalFunctionType(*CF))
OrigRetTy = FTy->getReturnType();
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 575e903..c5b9012 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -57,8 +57,14 @@ class SPIRVEmitIntrinsics
bool TrackConstants = true;
DenseMap<Instruction *, Constant *> AggrConsts;
DenseSet<Instruction *> AggrStores;
+
+ // Deduced element types of values.
+ DenseMap<Value *, Type *> DeducedElTys;
+ Type *deduceElementType(Value *I);
+
void preprocessCompositeConstants(IRBuilder<> &B);
void preprocessUndefs(IRBuilder<> &B);
+
CallInst *buildIntrWithMD(Intrinsic::ID IntrID, ArrayRef<Type *> Types,
Value *Arg, Value *Arg2, ArrayRef<Constant *> Imms,
IRBuilder<> &B) {
@@ -72,6 +78,7 @@ class SPIRVEmitIntrinsics
Args.push_back(Imm);
return B.CreateIntrinsic(IntrID, {Types}, Args);
}
+
void replaceMemInstrUses(Instruction *Old, Instruction *New, IRBuilder<> &B);
void processInstrAfterVisit(Instruction *I, IRBuilder<> &B);
void insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B);
@@ -156,6 +163,48 @@ static inline void reportFatalOnTokenType(const Instruction *I) {
false);
}
+// Deduce the element type of a value, returning the deduced Type on success,
+// or nullptr otherwise.
+static Type *deduceElementTypeHelper(Value *I,
+ std::unordered_set<Value *> &Visited,
+ DenseMap<Value *, Type *> &DeducedElTys) {
+ // maybe already known
+ auto It = DeducedElTys.find(I);
+ if (It != DeducedElTys.end())
+ return It->second;
+
+ // maybe a cycle
+ if (Visited.find(I) != Visited.end())
+ return nullptr;
+ Visited.insert(I);
+
+ // fallback value in case we fail to deduce a type
+ Type *Ty = nullptr;
+ // look for known basic patterns of type inference
+ if (auto *Ref = dyn_cast<AllocaInst>(I))
+ Ty = Ref->getAllocatedType();
+ else if (auto *Ref = dyn_cast<GetElementPtrInst>(I))
+ Ty = Ref->getResultElementType();
+ else if (auto *Ref = dyn_cast<GlobalValue>(I))
+ Ty = Ref->getValueType();
+ else if (auto *Ref = dyn_cast<AddrSpaceCastInst>(I))
+ Ty = deduceElementTypeHelper(Ref->getPointerOperand(), Visited,
+ DeducedElTys);
+
+ // remember the found relationship
+ if (Ty)
+ DeducedElTys[I] = Ty;
+
+ return Ty;
+}
+
+Type *SPIRVEmitIntrinsics::deduceElementType(Value *I) {
+ std::unordered_set<Value *> Visited;
+ if (Type *Ty = deduceElementTypeHelper(I, Visited, DeducedElTys))
+ return Ty;
+ return IntegerType::getInt8Ty(I->getContext());
+}
+
void SPIRVEmitIntrinsics::replaceMemInstrUses(Instruction *Old,
Instruction *New,
IRBuilder<> &B) {
@@ -280,7 +329,7 @@ Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) {
// varying element types. For IR coming from older versions of LLVM, such
// bitcasts do not provide sufficient information and should just be skipped
// here, to be handled in insertPtrCastOrAssignTypeInstr.
- if (I.getType()->isPointerTy()) {
+ if (isPointerTy(I.getType())) {
I.replaceAllUsesWith(Source);
I.eraseFromParent();
return nullptr;
@@ -333,20 +382,10 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
while (BitCastInst *BC = dyn_cast<BitCastInst>(Pointer))
Pointer = BC->getOperand(0);
- // Do not emit spv_ptrcast if Pointer is a GlobalValue of expected type.
- GlobalValue *GV = dyn_cast<GlobalValue>(Pointer);
- if (GV && GV->getValueType() == ExpectedElementType)
- return;
-
- // Do not emit spv_ptrcast if Pointer is a result of alloca with expected
- // type.
- AllocaInst *A = dyn_cast<AllocaInst>(Pointer);
- if (A && A->getAllocatedType() == ExpectedElementType)
- return;
-
- // Do not emit spv_ptrcast if Pointer is a result of GEP of expected type.
- GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Pointer);
- if (GEPI && GEPI->getResultElementType() == ExpectedElementType)
+ // Do not emit spv_ptrcast if Pointer's element type is ExpectedElementType
+ std::unordered_set<Value *> Visited;
+ Type *PointerElemTy = deduceElementTypeHelper(Pointer, Visited, DeducedElTys);
+ if (PointerElemTy == ExpectedElementType)
return;
setInsertPointSkippingPhis(B, I);
@@ -356,7 +395,7 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
ValueAsMetadata::getConstant(ExpectedElementTypeConst);
MDTuple *TyMD = MDNode::get(F->getContext(), CM);
MetadataAsValue *VMD = MetadataAsValue::get(F->getContext(), TyMD);
- unsigned AddressSpace = Pointer->getType()->getPointerAddressSpace();
+ unsigned AddressSpace = getPointerAddressSpace(Pointer->getType());
bool FirstPtrCastOrAssignPtrType = true;
// Do not emit a new spv_ptrcast if an equivalent one already exists or when
@@ -401,9 +440,11 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
// spv_assign_ptr_type instead.
if (FirstPtrCastOrAssignPtrType &&
(isa<Instruction>(Pointer) || isa<Argument>(Pointer))) {
- buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Pointer->getType()},
- ExpectedElementTypeConst, Pointer,
- {B.getInt32(AddressSpace)}, B);
+ CallInst *CI = buildIntrWithMD(
+ Intrinsic::spv_assign_ptr_type, {Pointer->getType()},
+ ExpectedElementTypeConst, Pointer, {B.getInt32(AddressSpace)}, B);
+ DeducedElTys[CI] = ExpectedElementType;
+ DeducedElTys[Pointer] = ExpectedElementType;
return;
}
@@ -419,7 +460,7 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
// Handle basic instructions:
StoreInst *SI = dyn_cast<StoreInst>(I);
if (SI && F->getCallingConv() == CallingConv::SPIR_KERNEL &&
- SI->getValueOperand()->getType()->isPointerTy() &&
+ isPointerTy(SI->getValueOperand()->getType()) &&
isa<Argument>(SI->getValueOperand())) {
return replacePointerOperandWithPtrCast(
I, SI->getValueOperand(), IntegerType::getInt8Ty(F->getContext()), 0,
@@ -440,9 +481,34 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
if (!CI || CI->isIndirectCall() || CI->getCalledFunction()->isIntrinsic())
return;
+ // collect information about formal parameter types
+ Function *CalledF = CI->getCalledFunction();
+ SmallVector<Type *, 4> CalledArgTys;
+ bool HaveTypes = false;
+ for (auto &CalledArg : CalledF->args()) {
+ if (!isPointerTy(CalledArg.getType())) {
+ CalledArgTys.push_back(nullptr);
+ continue;
+ }
+ auto It = DeducedElTys.find(&CalledArg);
+ Type *ParamTy = It != DeducedElTys.end() ? It->second : nullptr;
+ if (!ParamTy) {
+ for (User *U : CalledArg.users()) {
+ if (Instruction *Inst = dyn_cast<Instruction>(U)) {
+ std::unordered_set<Value *> Visited;
+ ParamTy = deduceElementTypeHelper(Inst, Visited, DeducedElTys);
+ if (ParamTy)
+ break;
+ }
+ }
+ }
+ HaveTypes |= ParamTy != nullptr;
+ CalledArgTys.push_back(ParamTy);
+ }
+
std::string DemangledName =
getOclOrSpirvBuiltinDemangledName(CI->getCalledFunction()->getName());
- if (DemangledName.empty())
+ if (DemangledName.empty() && !HaveTypes)
return;
for (unsigned OpIdx = 0; OpIdx < CI->arg_size(); OpIdx++) {
@@ -455,8 +521,11 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
if (!isa<Instruction>(ArgOperand) && !isa<Argument>(ArgOperand))
continue;
- Type *ExpectedType = SPIRV::parseBuiltinCallArgumentBaseType(
- DemangledName, OpIdx, I->getContext());
+ Type *ExpectedType =
+ OpIdx < CalledArgTys.size() ? CalledArgTys[OpIdx] : nullptr;
+ if (!ExpectedType && !DemangledName.empty())
+ ExpectedType = SPIRV::parseBuiltinCallArgumentBaseType(
+ DemangledName, OpIdx, I->getContext());
if (!ExpectedType)
continue;
@@ -639,30 +708,25 @@ void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV,
void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I,
IRBuilder<> &B) {
reportFatalOnTokenType(I);
- if (!I->getType()->isPointerTy() || !requireAssignType(I) ||
+ if (!isPointerTy(I->getType()) || !requireAssignType(I) ||
isa<BitCastInst>(I))
return;
setInsertPointSkippingPhis(B, I->getNextNode());
- Constant *EltTyConst;
- unsigned AddressSpace = I->getType()->getPointerAddressSpace();
- if (auto *AI = dyn_cast<AllocaInst>(I))
- EltTyConst = UndefValue::get(AI->getAllocatedType());
- else if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
- EltTyConst = UndefValue::get(GEP->getResultElementType());
- else
- EltTyConst = UndefValue::get(IntegerType::getInt8Ty(I->getContext()));
-
- buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {I->getType()}, EltTyConst, I,
- {B.getInt32(AddressSpace)}, B);
+ Type *ElemTy = deduceElementType(I);
+ Constant *EltTyConst = UndefValue::get(ElemTy);
+ unsigned AddressSpace = getPointerAddressSpace(I->getType());
+ CallInst *CI = buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {I->getType()},
+ EltTyConst, I, {B.getInt32(AddressSpace)}, B);
+ DeducedElTys[CI] = ElemTy;
}
void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
IRBuilder<> &B) {
reportFatalOnTokenType(I);
Type *Ty = I->getType();
- if (!Ty->isVoidTy() && !Ty->isPointerTy() && requireAssignType(I)) {
+ if (!Ty->isVoidTy() && !isPointerTy(Ty) && requireAssignType(I)) {
setInsertPointSkippingPhis(B, I->getNextNode());
Type *TypeToAssign = Ty;
if (auto *II = dyn_cast<IntrinsicInst>(I)) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 8556581..bda9c57 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -750,7 +750,7 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(
SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(
const Type *Ty, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR) {
- if (TypesInProcessing.count(Ty) && !Ty->isPointerTy())
+ if (TypesInProcessing.count(Ty) && !isPointerTy(Ty))
return nullptr;
TypesInProcessing.insert(Ty);
SPIRVType *SpirvType = createSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR);
@@ -762,11 +762,15 @@ SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(
// will be added later. For special types it is already added to DT.
if (SpirvType->getOpcode() != SPIRV::OpTypeForwardPointer && !Reg.isValid() &&
!isSpecialOpaqueType(Ty)) {
- if (!Ty->isPointerTy())
+ if (!isPointerTy(Ty))
DT.add(Ty, &MIRBuilder.getMF(), getSPIRVTypeID(SpirvType));
+ else if (isTypedPointerTy(Ty))
+ DT.add(cast<TypedPointerType>(Ty)->getElementType(),
+ getPointerAddressSpace(Ty), &MIRBuilder.getMF(),
+ getSPIRVTypeID(SpirvType));
else
DT.add(Type::getInt8Ty(MIRBuilder.getMF().getFunction().getContext()),
- Ty->getPointerAddressSpace(), &MIRBuilder.getMF(),
+ getPointerAddressSpace(Ty), &MIRBuilder.getMF(),
getSPIRVTypeID(SpirvType));
}
@@ -787,12 +791,15 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(
const Type *Ty, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR) {
Register Reg;
- if (!Ty->isPointerTy())
+ if (!isPointerTy(Ty))
Reg = DT.find(Ty, &MIRBuilder.getMF());
+ else if (isTypedPointerTy(Ty))
+ Reg = DT.find(cast<TypedPointerType>(Ty)->getElementType(),
+ getPointerAddressSpace(Ty), &MIRBuilder.getMF());
else
Reg =
DT.find(Type::getInt8Ty(MIRBuilder.getMF().getFunction().getContext()),
- Ty->getPointerAddressSpace(), &MIRBuilder.getMF());
+ getPointerAddressSpace(Ty), &MIRBuilder.getMF());
if (Reg.isValid() && !isSpecialOpaqueType(Ty))
return getSPIRVTypeForVReg(Reg);
@@ -836,11 +843,16 @@ bool SPIRVGlobalRegistry::isScalarOrVectorOfType(Register VReg,
unsigned
SPIRVGlobalRegistry::getScalarOrVectorComponentCount(Register VReg) const {
- if (SPIRVType *Type = getSPIRVTypeForVReg(VReg))
- return Type->getOpcode() == SPIRV::OpTypeVector
- ? static_cast<unsigned>(Type->getOperand(2).getImm())
- : 1;
- return 0;
+ return getScalarOrVectorComponentCount(getSPIRVTypeForVReg(VReg));
+}
+
+unsigned
+SPIRVGlobalRegistry::getScalarOrVectorComponentCount(SPIRVType *Type) const {
+ if (!Type)
+ return 0;
+ return Type->getOpcode() == SPIRV::OpTypeVector
+ ? static_cast<unsigned>(Type->getOperand(2).getImm())
+ : 1;
}
unsigned
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index 9c0061d..25d82ebf 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -198,9 +198,10 @@ public:
// opcode (e.g. OpTypeBool, or OpTypeVector %x 4, where %x is OpTypeBool).
bool isScalarOrVectorOfType(Register VReg, unsigned TypeOpcode) const;
- // Return number of elements in a vector if the given VReg is associated with
+ // Return number of elements in a vector if the argument is associated with
// a vector type. Return 1 for a scalar type, and 0 for a missing type.
unsigned getScalarOrVectorComponentCount(Register VReg) const;
+ unsigned getScalarOrVectorComponentCount(SPIRVType *Type) const;
// For vectors or scalars of booleans, integers and floats, return the scalar
// type's bitwidth. Otherwise calls llvm_unreachable().
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 74df8de..fd19b74 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -125,6 +125,8 @@ private:
bool selectConstVector(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ bool selectSplatVector(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
bool selectCmp(Register ResVReg, const SPIRVType *ResType,
unsigned comparisonOpcode, MachineInstr &I) const;
@@ -313,6 +315,8 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
case TargetOpcode::G_BUILD_VECTOR:
return selectConstVector(ResVReg, ResType, I);
+ case TargetOpcode::G_SPLAT_VECTOR:
+ return selectSplatVector(ResVReg, ResType, I);
case TargetOpcode::G_SHUFFLE_VECTOR: {
MachineBasicBlock &BB = *I.getParent();
@@ -1185,6 +1189,43 @@ bool SPIRVInstructionSelector::selectConstVector(Register ResVReg,
return MIB.constrainAllUses(TII, TRI, RBI);
}
+bool SPIRVInstructionSelector::selectSplatVector(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ if (ResType->getOpcode() != SPIRV::OpTypeVector)
+ report_fatal_error("Cannot select G_SPLAT_VECTOR with a non-vector result");
+ unsigned N = GR.getScalarOrVectorComponentCount(ResType);
+ unsigned OpIdx = I.getNumExplicitDefs();
+ if (!I.getOperand(OpIdx).isReg())
+ report_fatal_error("Unexpected argument in G_SPLAT_VECTOR");
+
+ // check if we may construct a constant vector
+ Register OpReg = I.getOperand(OpIdx).getReg();
+ bool IsConst = false;
+ if (SPIRVType *OpDef = MRI->getVRegDef(OpReg)) {
+ if (OpDef->getOpcode() == SPIRV::ASSIGN_TYPE &&
+ OpDef->getOperand(1).isReg()) {
+ if (SPIRVType *RefDef = MRI->getVRegDef(OpDef->getOperand(1).getReg()))
+ OpDef = RefDef;
+ }
+ IsConst = OpDef->getOpcode() == TargetOpcode::G_CONSTANT ||
+ OpDef->getOpcode() == TargetOpcode::G_FCONSTANT;
+ }
+
+ if (!IsConst && N < 2)
+ report_fatal_error(
+ "There must be at least two constituent operands in a vector");
+
+ auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(IsConst ? SPIRV::OpConstantComposite
+ : SPIRV::OpCompositeConstruct))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType));
+ for (unsigned i = 0; i < N; ++i)
+ MIB.addUse(OpReg);
+ return MIB.constrainAllUses(TII, TRI, RBI);
+}
+
bool SPIRVInstructionSelector::selectCmp(Register ResVReg,
const SPIRVType *ResType,
unsigned CmpOpc,
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index f815487..4b871bd 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -149,7 +149,9 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
getActionDefinitionsBuilder(G_GLOBAL_VALUE).alwaysLegal();
// TODO: add proper rules for vector legalization.
- getActionDefinitionsBuilder({G_BUILD_VECTOR, G_SHUFFLE_VECTOR}).alwaysLegal();
+ getActionDefinitionsBuilder(
+ {G_BUILD_VECTOR, G_SHUFFLE_VECTOR, G_SPLAT_VECTOR})
+ .alwaysLegal();
// Vector Reduction Operations
getActionDefinitionsBuilder(
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index e5f35aa..d5ed501 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -15,6 +15,7 @@
#include "MCTargetDesc/SPIRVBaseInfo.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypedPointerType.h"
#include <string>
namespace llvm {
@@ -100,5 +101,30 @@ bool isEntryPoint(const Function &F);
// Parse basic scalar type name, substring TypeName, and return LLVM type.
Type *parseBasicTypeName(StringRef TypeName, LLVMContext &Ctx);
+
+// True if this is an instance of TypedPointerType.
+inline bool isTypedPointerTy(const Type *T) {
+ return T->getTypeID() == Type::TypedPointerTyID;
+}
+
+// True if this is an instance of PointerType.
+inline bool isUntypedPointerTy(const Type *T) {
+ return T->getTypeID() == Type::PointerTyID;
+}
+
+// True if this is an instance of PointerType or TypedPointerType.
+inline bool isPointerTy(const Type *T) {
+ return isUntypedPointerTy(T) || isTypedPointerTy(T);
+}
+
+// Get the address space of this pointer or pointer vector type for instances of
+// PointerType or TypedPointerType.
+inline unsigned getPointerAddressSpace(const Type *T) {
+ Type *SubT = T->getScalarType();
+ return SubT->getTypeID() == Type::PointerTyID
+ ? cast<PointerType>(SubT)->getAddressSpace()
+ : cast<TypedPointerType>(SubT)->getAddressSpace();
+}
+
} // namespace llvm
#endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
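A usage sketch for the helpers added above (illustrative, and assuming SPIRVUtils.h is in scope): both pointer representations satisfy isPointerTy, and getPointerAddressSpace reads the address space from either.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/TypedPointerType.h"
#include <cassert>

using namespace llvm;

void pointerHelperDemo() {
  LLVMContext Ctx;
  Type *Untyped = PointerType::get(Ctx, /*AddressSpace=*/1);
  Type *Typed = TypedPointerType::get(Type::getInt32Ty(Ctx), /*AddressSpace=*/1);
  assert(isPointerTy(Untyped) && isPointerTy(Typed));
  assert(isUntypedPointerTy(Untyped) && isTypedPointerTy(Typed));
  assert(getPointerAddressSpace(Untyped) == 1 &&
         getPointerAddressSpace(Typed) == 1);
}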
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index ee4fd04..f65ed25 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -196,14 +196,24 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
.Case("0xb36", "arm1136j-s")
.Case("0xb56", "arm1156t2-s")
.Case("0xb76", "arm1176jz-s")
+ .Case("0xc05", "cortex-a5")
+ .Case("0xc07", "cortex-a7")
.Case("0xc08", "cortex-a8")
.Case("0xc09", "cortex-a9")
.Case("0xc0f", "cortex-a15")
+ .Case("0xc0e", "cortex-a17")
.Case("0xc20", "cortex-m0")
.Case("0xc23", "cortex-m3")
.Case("0xc24", "cortex-m4")
+ .Case("0xc27", "cortex-m7")
+ .Case("0xd20", "cortex-m23")
+ .Case("0xd21", "cortex-m33")
.Case("0xd24", "cortex-m52")
.Case("0xd22", "cortex-m55")
+ .Case("0xd23", "cortex-m85")
+ .Case("0xc18", "cortex-r8")
+ .Case("0xd13", "cortex-r52")
+ .Case("0xd15", "cortex-r82")
.Case("0xd02", "cortex-a34")
.Case("0xd04", "cortex-a35")
.Case("0xd03", "cortex-a53")
@@ -211,13 +221,17 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
.Case("0xd46", "cortex-a510")
.Case("0xd80", "cortex-a520")
.Case("0xd07", "cortex-a57")
+ .Case("0xd06", "cortex-a65")
+ .Case("0xd43", "cortex-a65ae")
.Case("0xd08", "cortex-a72")
.Case("0xd09", "cortex-a73")
.Case("0xd0a", "cortex-a75")
.Case("0xd0b", "cortex-a76")
+ .Case("0xd0e", "cortex-a76ae")
.Case("0xd0d", "cortex-a77")
.Case("0xd41", "cortex-a78")
.Case("0xd42", "cortex-a78ae")
+ .Case("0xd4b", "cortex-a78c")
.Case("0xd47", "cortex-a710")
.Case("0xd4d", "cortex-a715")
.Case("0xd81", "cortex-a720")
@@ -226,6 +240,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
.Case("0xd48", "cortex-x2")
.Case("0xd4e", "cortex-x3")
.Case("0xd82", "cortex-x4")
+ .Case("0xd4a", "neoverse-e1")
.Case("0xd0c", "neoverse-n1")
.Case("0xd49", "neoverse-n2")
.Case("0xd40", "neoverse-v1")
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 488a6f0..f98833b 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12371,7 +12371,7 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo {
SplitBlockAndInsertIfThen(LastCmp, IP, /* Unreachable */ false);
BasicBlock *CBBB = CB->getParent();
A.registerManifestAddedBasicBlock(*ThenTI->getParent());
- A.registerManifestAddedBasicBlock(*CBBB);
+ A.registerManifestAddedBasicBlock(*IP->getParent());
auto *SplitTI = cast<BranchInst>(LastCmp->getNextNode());
BasicBlock *ElseBB;
if (&*IP == CB) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index f5f3716..694b180 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -504,6 +504,11 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(II.getType()));
}
+ // If ctlz/cttz is only used as a shift amount, set is_zero_poison to true.
+ if (II.hasOneUse() && match(Op1, m_Zero()) &&
+ match(II.user_back(), m_Shift(m_Value(), m_Specific(&II))))
+ return IC.replaceOperand(II, 1, IC.Builder.getTrue());
+
Constant *C;
if (IsTZ) {
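The transform above is sound because a count that only feeds a shift amount can never usefully equal the bit width: ctlz/cttz of zero returns the width, and a shift by the full width is already poison, so declaring the zero input poison loses nothing. A C++ analogue of the precondition (names illustrative):

#include <cassert>
#include <cstdint>

// V << cttz(X) is only well defined for X != 0: for X == 0 the count is 32,
// and a 32-bit shift by 32 is undefined (poison in IR). An IR cttz whose
// sole use is a shift amount may therefore declare its zero input poison.
uint32_t shiftByTrailingZeros(uint32_t V, uint32_t X) {
  assert(X != 0 && "a shift amount of 32 would be poison anyway");
  return V << __builtin_ctz(X); // __builtin_ctz(0) is itself undefined
}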
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 45afa63..a9817f1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1121,6 +1121,10 @@ Instruction *InstCombinerImpl::visitZExt(ZExtInst &Zext) {
Value *Src = Zext.getOperand(0);
Type *SrcTy = Src->getType(), *DestTy = Zext.getType();
+ // zext nneg bool x -> 0
+ if (SrcTy->isIntOrIntVectorTy(1) && Zext.hasNonNeg())
+ return replaceInstUsesWith(Zext, Constant::getNullValue(Zext.getType()));
+
// Try to extend the entire expression tree to the wide destination type.
unsigned BitsToClear;
if (shouldChangeType(SrcTy, DestTy) &&
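The reasoning behind the zext nneg fold above: interpreted as a signed 1-bit value, an i1 is either 0 or -1, so the nneg flag leaves 0 as the only non-poison input and the extension folds to the zero constant. A tiny sanity check (illustrative only):

#include <cassert>
#include <cstdint>

// "zext nneg i1 %x" asserts %x is non-negative as a signed 1-bit value.
// The signed values of i1 are 0 and -1, so the only non-poison input is
// false, and the result is always 0.
int32_t signedValueOfI1(bool B) { return B ? -1 : 0; }

void zextNNegBoolCheck() {
  for (bool B : {false, true})
    if (signedValueOfI1(B) >= 0)
      assert(B == false && "non-negative i1 must be 0");
}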
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1688005..c9bbe43 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -5202,7 +5202,8 @@ static bool combineInstructionsOverFunction(
if (Iteration > Opts.MaxIterations) {
report_fatal_error(
"Instruction Combining did not reach a fixpoint after " +
- Twine(Opts.MaxIterations) + " iterations");
+ Twine(Opts.MaxIterations) + " iterations",
+ /*GenCrashDiag=*/false);
}
}
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 11a5c29c..87584da 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -260,6 +260,10 @@ static cl::opt<bool> ClUsePageAliases("hwasan-experimental-use-page-aliases",
namespace {
+template <typename T> T optOr(cl::opt<T> &Opt, T Other) {
+ return Opt.getNumOccurrences() ? Opt : Other;
+}
+
bool shouldUsePageAliases(const Triple &TargetTriple) {
return ClUsePageAliases && TargetTriple.getArch() == Triple::x86_64;
}
@@ -269,14 +273,11 @@ bool shouldInstrumentStack(const Triple &TargetTriple) {
}
bool shouldInstrumentWithCalls(const Triple &TargetTriple) {
- return ClInstrumentWithCalls.getNumOccurrences()
- ? ClInstrumentWithCalls
- : TargetTriple.getArch() == Triple::x86_64;
+ return optOr(ClInstrumentWithCalls, TargetTriple.getArch() == Triple::x86_64);
}
bool mightUseStackSafetyAnalysis(bool DisableOptimization) {
- return ClUseStackSafety.getNumOccurrences() ? ClUseStackSafety
- : !DisableOptimization;
+ return optOr(ClUseStackSafety, !DisableOptimization);
}
bool shouldUseStackSafetyAnalysis(const Triple &TargetTriple,
@@ -296,10 +297,8 @@ public:
HWAddressSanitizer(Module &M, bool CompileKernel, bool Recover,
const StackSafetyGlobalInfo *SSI)
: M(M), SSI(SSI) {
- this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
- this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0
- ? ClEnableKhwasan
- : CompileKernel;
+ this->Recover = optOr(ClRecover, Recover);
+ this->CompileKernel = optOr(ClEnableKhwasan, CompileKernel);
this->Rng =
ClRandomSkipRate.getNumOccurrences() ? M.createRNG("hwasan") : nullptr;
@@ -625,19 +624,14 @@ void HWAddressSanitizer::initializeModule() {
bool NewRuntime =
!TargetTriple.isAndroid() || !TargetTriple.isAndroidVersionLT(30);
- UseShortGranules =
- ClUseShortGranules.getNumOccurrences() ? ClUseShortGranules : NewRuntime;
- OutlinedChecks =
- (TargetTriple.isAArch64() || TargetTriple.isRISCV64()) &&
- TargetTriple.isOSBinFormatELF() &&
- (ClInlineAllChecks.getNumOccurrences() ? !ClInlineAllChecks : !Recover);
+ UseShortGranules = optOr(ClUseShortGranules, NewRuntime);
+ OutlinedChecks = (TargetTriple.isAArch64() || TargetTriple.isRISCV64()) &&
+ TargetTriple.isOSBinFormatELF() &&
+ !optOr(ClInlineAllChecks, Recover);
- InlineFastPath =
- (ClInlineFastPathChecks.getNumOccurrences()
- ? ClInlineFastPathChecks
- : !(TargetTriple.isAndroid() ||
- TargetTriple.isOSFuchsia())); // These platforms may prefer less
- // inlining to reduce binary size.
+ // These platforms may prefer less inlining to reduce binary size.
+ InlineFastPath = optOr(ClInlineFastPathChecks, !(TargetTriple.isAndroid() ||
+ TargetTriple.isOSFuchsia()));
if (ClMatchAllTag.getNumOccurrences()) {
if (ClMatchAllTag != -1) {
@@ -649,22 +643,17 @@ void HWAddressSanitizer::initializeModule() {
UseMatchAllCallback = !CompileKernel && MatchAllTag.has_value();
// If we don't have personality function support, fall back to landing pads.
- InstrumentLandingPads = ClInstrumentLandingPads.getNumOccurrences()
- ? ClInstrumentLandingPads
- : !NewRuntime;
+ InstrumentLandingPads = optOr(ClInstrumentLandingPads, !NewRuntime);
if (!CompileKernel) {
createHwasanCtorComdat();
- bool InstrumentGlobals =
- ClGlobals.getNumOccurrences() ? ClGlobals : NewRuntime;
+ bool InstrumentGlobals = optOr(ClGlobals, NewRuntime);
if (InstrumentGlobals && !UsePageAliases)
instrumentGlobals();
bool InstrumentPersonalityFunctions =
- ClInstrumentPersonalityFunctions.getNumOccurrences()
- ? ClInstrumentPersonalityFunctions
- : NewRuntime;
+ optOr(ClInstrumentPersonalityFunctions, NewRuntime);
if (InstrumentPersonalityFunctions)
instrumentPersonalityFunctions();
}
diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp
index ccca8bc..6ad4be169 100644
--- a/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -311,7 +311,7 @@ void Float2IntPass::walkForwards() {
}
// If there is a valid transform to be done, do it.
-bool Float2IntPass::validateAndTransform() {
+bool Float2IntPass::validateAndTransform(const DataLayout &DL) {
bool MadeChange = false;
// Iterate over every disjoint partition of the def-use graph.
@@ -376,15 +376,23 @@ bool Float2IntPass::validateAndTransform() {
LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
continue;
}
- if (MinBW > 64) {
- LLVM_DEBUG(
- dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
- continue;
- }
- // OK, R is known to be representable. Now pick a type for it.
- // FIXME: Pick the smallest legal type that will fit.
- Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
+ // OK, R is known to be representable.
+ // Pick the smallest legal type that will fit.
+ Type *Ty = DL.getSmallestLegalIntType(*Ctx, MinBW);
+ if (!Ty) {
+ // Every supported target supports 64-bit and 32-bit integers,
+      // so fall back to a 32- or 64-bit integer if the value fits.
+ if (MinBW <= 32) {
+ Ty = Type::getInt32Ty(*Ctx);
+ } else if (MinBW <= 64) {
+ Ty = Type::getInt64Ty(*Ctx);
+ } else {
+        LLVM_DEBUG(dbgs() << "F2I: Value requires more bits to represent "
+                             "than the target supports!\n");
+ continue;
+ }
+ }
for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
MI != ME; ++MI)
@@ -491,7 +499,8 @@ bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) {
walkBackwards();
walkForwards();
- bool Modified = validateAndTransform();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ bool Modified = validateAndTransform(DL);
if (Modified)
cleanup();
return Modified;
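
Illustration (not part of the diff): with the DataLayout threaded through, the pass can consult the target's native integer widths (the "n" spec) and pick the smallest legal type that holds the computed range, rather than hardwiring i32/i64. A sketch under an assumed datalayout with native i16; the function names and datalayout string are illustrative:

target datalayout = "n16:32:64"

define i16 @f2i_before(i8 %a) {
  %f = uitofp i8 %a to float
  %d = fadd float %f, %f        ; range is [0, 510], comfortably under 16 bits
  %r = fptoui float %d to i16
  ret i16 %r
}

; the pass may now rewrite the float arithmetic in i16, the smallest
; legal type that fits, instead of falling back to i32:
define i16 @f2i_after(i8 %a) {
  %z = zext i8 %a to i16
  %d = add i16 %z, %z
  ret i16 %d
}
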
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 7b74caa..a87e5a3 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -2848,7 +2848,7 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA,
Updates.push_back({DominatorTree::Delete, BB, UniqueSuccessor});
DTU->applyUpdates(Updates);
}
- BB->flushTerminatorDbgValues();
+ BB->flushTerminatorDbgRecords();
return NumInstrsRemoved;
}
diff --git a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
index 81545ef..d9832ee 100644
--- a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
+++ b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
@@ -42,8 +42,11 @@ static bool isSafeDecreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
ICmpInst::Predicate BoundPred =
IsSigned ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+ auto StartLG = SE.applyLoopGuards(Start, L);
+ auto BoundLG = SE.applyLoopGuards(BoundSCEV, L);
+
if (LatchBrExitIdx == 1)
- return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG, BoundLG);
assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be either 0 or 1");
@@ -54,10 +57,10 @@ static bool isSafeDecreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Min), StepPlusOne);
const SCEV *MinusOne =
- SE.getMinusSCEV(BoundSCEV, SE.getOne(BoundSCEV->getType()));
+ SE.getMinusSCEV(BoundLG, SE.getOne(BoundLG->getType()));
- return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, MinusOne) &&
- SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit);
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG, MinusOne) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundLG, Limit);
}
/// Given a loop with an increasing induction variable, is it possible to
@@ -86,8 +89,11 @@ static bool isSafeIncreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
ICmpInst::Predicate BoundPred =
IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
+ auto StartLG = SE.applyLoopGuards(Start, L);
+ auto BoundLG = SE.applyLoopGuards(BoundSCEV, L);
+
if (LatchBrExitIdx == 1)
- return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG, BoundLG);
assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be 0 or 1");
@@ -97,9 +103,9 @@ static bool isSafeIncreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
: APInt::getMaxValue(BitWidth);
const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Max), StepMinusOne);
- return (SE.isLoopEntryGuardedByCond(L, BoundPred, Start,
- SE.getAddExpr(BoundSCEV, Step)) &&
- SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
+ return (SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG,
+ SE.getAddExpr(BoundLG, Step)) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundLG, Limit));
}
/// Returns estimate for max latch taken count of the loop of the narrowest
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 8c6af7a..acfd87c 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -577,28 +577,28 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
Module *M = OrigHeader->getModule();
- // Track the next DPValue to clone. If we have a sequence where an
+ // Track the next DbgRecord to clone. If we have a sequence where an
// instruction is hoisted instead of being cloned:
- // DPValue blah
+ // DbgRecord blah
// %foo = add i32 0, 0
- // DPValue xyzzy
+ // DbgRecord xyzzy
// %bar = call i32 @foobar()
- // where %foo is hoisted, then the DPValue "blah" will be seen twice, once
+ // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once
  // attached to %foo, then when %foo is hoisted it will "fall down" onto the
// function call:
- // DPValue blah
- // DPValue xyzzy
+ // DbgRecord blah
+ // DbgRecord xyzzy
// %bar = call i32 @foobar()
// causing it to appear attached to the call too.
//
// To avoid this, cloneDebugInfoFrom takes an optional "start cloning from
- // here" position to account for this behaviour. We point it at any DPValues
- // on the next instruction, here labelled xyzzy, before we hoist %foo.
- // Later, we only only clone DPValues from that position (xyzzy) onwards,
- // which avoids cloning DPValue "blah" multiple times.
- // (Stored as a range because it gives us a natural way of testing whether
- // there were DPValues on the next instruction before we hoisted things).
- iterator_range<DPValue::self_iterator> NextDbgInsts =
+ // here" position to account for this behaviour. We point it at any
+ // DbgRecords on the next instruction, here labelled xyzzy, before we hoist
+  // %foo. Later, we only clone DbgRecords from that position (xyzzy)
+ // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as
+ // a range because it gives us a natural way of testing whether
+ // there were DbgRecords on the next instruction before we hoisted things).
+ iterator_range<DbgRecord::self_iterator> NextDbgInsts =
(I != E) ? I->getDbgRecordRange() : DPMarker::getEmptyDbgRecordRange();
while (I != E) {
@@ -777,7 +777,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// OrigPreHeader's old terminator (the original branch into the loop), and
// remove the corresponding incoming values from the PHI nodes in OrigHeader.
LoopEntryBranch->eraseFromParent();
- OrigPreheader->flushTerminatorDbgValues();
+ OrigPreheader->flushTerminatorDbgRecords();
// Update MemorySSA before the rewrite call below changes the 1:1
// instruction:cloned_instruction_or_value mapping.
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 0f3d140..6d2a6a3 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1572,7 +1572,8 @@ hoistLockstepIdenticalDPValues(Instruction *TI, Instruction *I1,
while (none_of(Itrs, atEnd)) {
bool HoistDPVs = allIdentical(Itrs);
for (CurrentAndEndIt &Pair : Itrs) {
- // Increment Current iterator now as we may be about to move the DPValue.
+ // Increment Current iterator now as we may be about to move the
+ // DbgRecord.
DbgRecord &DR = *Pair.first++;
if (HoistDPVs) {
DR.removeFromParent();
@@ -5304,7 +5305,7 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
// Ensure that any debug-info records that used to occur after the Unreachable
// are moved to in front of it -- otherwise they'll "dangle" at the end of
// the block.
- BB->flushTerminatorDbgValues();
+ BB->flushTerminatorDbgRecords();
// Debug-info records on the unreachable inst itself should be deleted, as
// below we delete everything past the final executable instruction.
@@ -5326,8 +5327,8 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
// block will be the unwind edges of Invoke/CatchSwitch/CleanupReturn,
// and we can therefore guarantee this block will be erased.
- // If we're deleting this, we're deleting any subsequent dbg.values, so
- // delete DPValue records of variable information.
+ // If we're deleting this, we're deleting any subsequent debug info, so
+ // delete DbgRecords.
BBI->dropDbgRecords();
// Delete this instruction (any uses are guaranteed to be dead)
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 3da1610..abb7a44 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -146,7 +146,7 @@ public:
Value *mapValue(const Value *V);
void remapInstruction(Instruction *I);
void remapFunction(Function &F);
- void remapDPValue(DbgRecord &DPV);
+ void remapDbgRecord(DbgRecord &DPV);
Constant *mapConstant(const Constant *C) {
return cast_or_null<Constant>(mapValue(C));
@@ -537,7 +537,7 @@ Value *Mapper::mapValue(const Value *V) {
return getVM()[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
}
-void Mapper::remapDPValue(DbgRecord &DR) {
+void Mapper::remapDbgRecord(DbgRecord &DR) {
if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
DPL->setLabel(cast<DILabel>(mapMetadata(DPL->getLabel())));
return;
@@ -1067,7 +1067,7 @@ void Mapper::remapFunction(Function &F) {
for (Instruction &I : BB) {
remapInstruction(&I);
for (DbgRecord &DR : I.getDbgRecordRange())
- remapDPValue(DR);
+ remapDbgRecord(DR);
}
}
}
@@ -1234,7 +1234,7 @@ void ValueMapper::remapInstruction(Instruction &I) {
}
void ValueMapper::remapDPValue(Module *M, DPValue &V) {
- FlushingMapper(pImpl)->remapDPValue(V);
+ FlushingMapper(pImpl)->remapDbgRecord(V);
}
void ValueMapper::remapDPValueRange(
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a7ebf78..e86705e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -79,6 +79,13 @@ public:
VPBasicBlock *getInsertBlock() const { return BB; }
VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
+ /// Create a VPBuilder to insert after \p R.
+ static VPBuilder getToInsertAfter(VPRecipeBase *R) {
+ VPBuilder B;
+ B.setInsertPoint(R->getParent(), std::next(R->getIterator()));
+ return B;
+ }
+
/// InsertPoint - A saved insertion point.
class VPInsertPoint {
VPBasicBlock *Block = nullptr;
@@ -131,8 +138,9 @@ public:
/// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
/// its underlying Instruction.
- VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
- Instruction *Inst = nullptr, const Twine &Name = "") {
+ VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ Instruction *Inst = nullptr,
+ const Twine &Name = "") {
DebugLoc DL;
if (Inst)
DL = Inst->getDebugLoc();
@@ -140,8 +148,8 @@ public:
NewVPInst->setUnderlyingValue(Inst);
return NewVPInst;
}
- VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
- DebugLoc DL, const Twine &Name = "") {
+ VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ DebugLoc DL, const Twine &Name = "") {
return createInstruction(Opcode, Operands, DL, Name);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 6474a96..877b5d4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -296,8 +296,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
// recipes.
if (Br->isConditional()) {
VPValue *Cond = getOrCreateVPOperand(Br->getCondition());
- VPBB->appendRecipe(
- new VPInstruction(VPInstruction::BranchOnCond, {Cond}));
+ VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst);
}
// Skip the rest of the Instruction processing for Branch instructions.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f6b564a..3b19db9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1192,10 +1192,10 @@ void VPlanTransforms::addActiveLaneMask(
LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
Plan, DataAndControlFlowWithoutRuntimeCheck);
} else {
- LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
- {WideCanonicalIV, Plan.getTripCount()},
- nullptr, "active.lane.mask");
- LaneMask->insertAfter(WideCanonicalIV);
+ VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
+ LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
+ {WideCanonicalIV, Plan.getTripCount()}, nullptr,
+ "active.lane.mask");
}
// Walk users of WideCanonicalIV and replace all compares of the form
diff --git a/llvm/test/Analysis/Lint/crash_empty_iterator.ll b/llvm/test/Analysis/Lint/crash_empty_iterator.ll
new file mode 100644
index 0000000..2fbecbc
--- /dev/null
+++ b/llvm/test/Analysis/Lint/crash_empty_iterator.ll
@@ -0,0 +1,22 @@
+; RUN: opt -passes="lint" -S < %s | FileCheck %s
+
+; After 2fe81edef6f0b
+; [NFC][RemoveDIs] Insert instruction using iterators in Transforms/
+; this crashed in FindInsertedValue when dereferencing an empty
+; optional iterator.
+; Just see that it doesn't crash anymore.
+
+; CHECK-LABEL: @test1
+
+%struct = type { i32, i32 }
+
+define void @test1() {
+entry:
+ %.fca.1.insert = insertvalue %struct zeroinitializer, i32 0, 1
+ %0 = extractvalue %struct %.fca.1.insert, 0
+ %1 = tail call %struct @foo(i32 %0)
+ ret void
+}
+
+declare %struct @foo(i32)
+
diff --git a/llvm/test/Analysis/ValueTracking/knownbits-select-from-cond.ll b/llvm/test/Analysis/ValueTracking/knownbits-select-from-cond.ll
new file mode 100644
index 0000000..c3343edf
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/knownbits-select-from-cond.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+define i8 @select_condition_implies_highbits_op1(i8 %xx, i8 noundef %y) {
+; CHECK-LABEL: @select_condition_implies_highbits_op1(
+; CHECK-NEXT: [[X:%.*]] = and i8 [[XX:%.*]], 15
+; CHECK-NEXT: [[COND:%.*]] = icmp ult i8 [[Y:%.*]], 3
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND]], i8 [[Y]], i8 [[X]]
+; CHECK-NEXT: [[R:%.*]] = or disjoint i8 [[SEL]], 32
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %x = and i8 %xx, 15
+ %cond = icmp ult i8 %y, 3
+ %sel = select i1 %cond, i8 %y, i8 %x
+ %r = add i8 %sel, 32
+ ret i8 %r
+}
+
+define i8 @select_condition_implies_highbits_op1_maybe_undef_fail(i8 %xx, i8 %y) {
+; CHECK-LABEL: @select_condition_implies_highbits_op1_maybe_undef_fail(
+; CHECK-NEXT: [[X:%.*]] = and i8 [[XX:%.*]], 15
+; CHECK-NEXT: [[COND:%.*]] = icmp ult i8 [[Y:%.*]], 3
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND]], i8 [[Y]], i8 [[X]]
+; CHECK-NEXT: [[R:%.*]] = add i8 [[SEL]], 32
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %x = and i8 %xx, 15
+ %cond = icmp ult i8 %y, 3
+ %sel = select i1 %cond, i8 %y, i8 %x
+ %r = add i8 %sel, 32
+ ret i8 %r
+}
+
+define i8 @select_condition_implies_highbits_op2(i8 %xx, i8 noundef %y) {
+; CHECK-LABEL: @select_condition_implies_highbits_op2(
+; CHECK-NEXT: [[X:%.*]] = and i8 [[XX:%.*]], 15
+; CHECK-NEXT: [[COND:%.*]] = icmp ugt i8 [[Y:%.*]], 3
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND]], i8 [[X]], i8 [[Y]]
+; CHECK-NEXT: [[R:%.*]] = or disjoint i8 [[SEL]], 32
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %x = and i8 %xx, 15
+ %cond = icmp ugt i8 %y, 3
+ %sel = select i1 %cond, i8 %x, i8 %y
+ %r = add i8 %sel, 32
+ ret i8 %r
+}
+
+define i8 @select_condition_implies_highbits_op1_and(i8 %xx, i8 noundef %y, i1 %other_cond) {
+; CHECK-LABEL: @select_condition_implies_highbits_op1_and(
+; CHECK-NEXT: [[X:%.*]] = and i8 [[XX:%.*]], 15
+; CHECK-NEXT: [[COND0:%.*]] = icmp ult i8 [[Y:%.*]], 3
+; CHECK-NEXT: [[COND:%.*]] = and i1 [[COND0]], [[OTHER_COND:%.*]]
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND]], i8 [[Y]], i8 [[X]]
+; CHECK-NEXT: [[R:%.*]] = or disjoint i8 [[SEL]], 32
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %x = and i8 %xx, 15
+ %cond0 = icmp ult i8 %y, 3
+ %cond = and i1 %cond0, %other_cond
+ %sel = select i1 %cond, i8 %y, i8 %x
+ %r = add i8 %sel, 32
+ ret i8 %r
+}
+
+define i8 @select_condition_implies_highbits_op2_or(i8 %xx, i8 noundef %y, i1 %other_cond) {
+; CHECK-LABEL: @select_condition_implies_highbits_op2_or(
+; CHECK-NEXT: [[X:%.*]] = and i8 [[XX:%.*]], 15
+; CHECK-NEXT: [[COND0:%.*]] = icmp ugt i8 [[Y:%.*]], 3
+; CHECK-NEXT: [[COND:%.*]] = or i1 [[COND0]], [[OTHER_COND:%.*]]
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND]], i8 [[X]], i8 [[Y]]
+; CHECK-NEXT: [[R:%.*]] = or disjoint i8 [[SEL]], 32
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %x = and i8 %xx, 15
+ %cond0 = icmp ugt i8 %y, 3
+ %cond = or i1 %cond0, %other_cond
+ %sel = select i1 %cond, i8 %x, i8 %y
+ %r = add i8 %sel, 32
+ ret i8 %r
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 4932529..3007e7c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -8,9 +8,8 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) {
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%tmp1 = load <4 x double>, ptr %ptr
%tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
@@ -26,13 +25,10 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(ptr %ptr) {
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-NEXT: fcvtzs v3.2d, v3.2d
; CHECK-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: xtn v3.2s, v3.2d
-; CHECK-NEXT: xtn v2.2s, v2.2d
-; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: uzp1 v1.4h, v2.4h, v3.4h
-; CHECK-NEXT: uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: uzp1 v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%tmp1 = load <8 x double>, ptr %ptr
%tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
@@ -96,9 +92,8 @@ define <4 x i16> @fptoui_v4f64_to_v4i16(ptr %ptr) {
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%tmp1 = load <4 x double>, ptr %ptr
%tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index bccfdb9..9ebd570 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -59,7 +59,7 @@ define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
@@ -388,7 +388,7 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-NEXT: add x8, sp, #12
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [sp, #12]
; CHECK-NEXT: ld1 { v0.h }[0], [x8]
; CHECK-NEXT: orr x8, x8, #0x2
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 1f68c77..dff4831 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -650,7 +650,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: add x11, x3, #12
; CHECK-NEXT: str s1, [x4]
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ldp s0, s5, [x2]
+; CHECK-NEXT: ldp s0, s4, [x2]
; CHECK-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-NEXT: umov w9, v2.h[0]
; CHECK-NEXT: umov w10, v2.h[1]
@@ -662,24 +662,25 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NEXT: mov v0.b[10], w9
; CHECK-NEXT: add x9, x1, #4
-; CHECK-NEXT: uzp1 v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: mov v1.d[1], v2.d[0]
; CHECK-NEXT: mov v0.b[11], w10
; CHECK-NEXT: add x10, x1, #12
+; CHECK-NEXT: bic v1.8h, #255, lsl #8
; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4
-; CHECK-NEXT: ldr s4, [x0, #12]
-; CHECK-NEXT: ldp s3, s16, [x0, #4]
-; CHECK-NEXT: ld1 { v5.s }[1], [x3]
-; CHECK-NEXT: ldp s6, s7, [x2, #8]
-; CHECK-NEXT: ld1 { v4.s }[1], [x10]
-; CHECK-NEXT: ld1 { v3.s }[1], [x9]
-; CHECK-NEXT: ld1 { v6.s }[1], [x8]
-; CHECK-NEXT: ld1 { v7.s }[1], [x11]
+; CHECK-NEXT: ldr s3, [x0, #12]
+; CHECK-NEXT: ldp s2, s7, [x0, #4]
+; CHECK-NEXT: ld1 { v4.s }[1], [x3]
+; CHECK-NEXT: ldp s5, s6, [x2, #8]
+; CHECK-NEXT: ld1 { v3.s }[1], [x10]
+; CHECK-NEXT: ld1 { v2.s }[1], [x9]
+; CHECK-NEXT: ld1 { v5.s }[1], [x8]
+; CHECK-NEXT: ld1 { v6.s }[1], [x11]
; CHECK-NEXT: add x8, x1, #8
-; CHECK-NEXT: ld1 { v16.s }[1], [x8]
-; CHECK-NEXT: uaddl v2.8h, v3.8b, v4.8b
-; CHECK-NEXT: ushll v3.8h, v6.8b, #0
-; CHECK-NEXT: uaddl v4.8h, v5.8b, v7.8b
-; CHECK-NEXT: uaddl v1.8h, v1.8b, v16.8b
+; CHECK-NEXT: ld1 { v7.s }[1], [x8]
+; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT: ushll v3.8h, v5.8b, #0
+; CHECK-NEXT: uaddl v4.8h, v4.8b, v6.8b
+; CHECK-NEXT: uaddw v1.8h, v1.8h, v7.8b
; CHECK-NEXT: uaddw2 v5.8h, v3.8h, v0.16b
; CHECK-NEXT: ushll v0.4s, v2.4h, #3
; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
diff --git a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll
index 897d35a..8de0f0d 100644
--- a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll
+++ b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll
@@ -131,7 +131,7 @@ define i32 @f7() {
; GISEL-NEXT: ret
entry:
- %lshr = lshr i128 bitcast (<2 x i64> <i64 undef, i64 ptrtoint (ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @x3, i64 0, inrange i32 1, i64 2) to i64)> to i128), 64
+ %lshr = lshr i128 bitcast (<2 x i64> <i64 undef, i64 ptrtoint (ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @x3, i64 0, i32 1, i64 2) to i64)> to i128), 64
%trunc = trunc i128 %lshr to i64
%inttoptr = inttoptr i64 %trunc to ptr
%gep = getelementptr i32, ptr %inttoptr, i64 5
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
index 1ea87bb..0a3b9a0 100644
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -73,9 +73,8 @@ define void @fptoui_v8f32_to_v8i8_no_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: fcvtzs.4s v1, v1
; CHECK-NEXT: fcvtzs.4s v0, v0
-; CHECK-NEXT: xtn.4h v1, v1
-; CHECK-NEXT: xtn.4h v0, v0
-; CHECK-NEXT: uzp1.8b v0, v0, v1
+; CHECK-NEXT: uzp1.8h v0, v0, v1
+; CHECK-NEXT: xtn.8b v0, v0
; CHECK-NEXT: str d0, [x1]
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 67190e8..7af01b5 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -1096,30 +1096,17 @@ entry:
}
define <3 x i16> @fptos_v3f64_v3i16(<3 x double> %a) {
-; CHECK-SD-LABEL: fptos_v3f64_v3i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fptos_v3f64_v3i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fptos_v3f64_v3i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: fcvtzs v1.2d, v2.2d
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%c = fptosi <3 x double> %a to <3 x i16>
ret <3 x i16> %c
@@ -1134,9 +1121,8 @@ define <3 x i16> @fptou_v3f64_v3i16(<3 x double> %a) {
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v3f64_v3i16:
@@ -1160,9 +1146,8 @@ define <4 x i16> @fptos_v4f64_v4i16(<4 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v4f64_v4i16:
@@ -1182,9 +1167,8 @@ define <4 x i16> @fptou_v4f64_v4i16(<4 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v4f64_v4i16:
@@ -1600,9 +1584,8 @@ define <3 x i8> @fptos_v3f64_v3i8(<3 x double> %a) {
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: umov w0, v0.h[0]
; CHECK-SD-NEXT: umov w1, v0.h[1]
; CHECK-SD-NEXT: umov w2, v0.h[2]
@@ -1638,9 +1621,8 @@ define <3 x i8> @fptou_v3f64_v3i8(<3 x double> %a) {
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: umov w0, v0.h[0]
; CHECK-SD-NEXT: umov w1, v0.h[1]
; CHECK-SD-NEXT: umov w2, v0.h[2]
@@ -1672,9 +1654,8 @@ define <4 x i8> @fptos_v4f64_v4i8(<4 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v4f64_v4i8:
@@ -1694,9 +1675,8 @@ define <4 x i8> @fptou_v4f64_v4i8(<4 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v4f64_v4i8:
@@ -1718,13 +1698,10 @@ define <8 x i8> @fptos_v8f64_v8i8(<8 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: xtn v0.8b, v0.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v8f64_v8i8:
@@ -1750,13 +1727,10 @@ define <8 x i8> @fptou_v8f64_v8i8(<8 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: xtn v0.8b, v0.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v8f64_v8i8:
@@ -1786,21 +1760,13 @@ define <16 x i8> @fptos_v16f64_v16i8(<16 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v7.2s, v7.2d
-; CHECK-SD-NEXT: xtn v6.2s, v6.2d
-; CHECK-SD-NEXT: xtn v5.2s, v5.2d
-; CHECK-SD-NEXT: xtn v4.2s, v4.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v1.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v16f64_v16i8:
@@ -1837,21 +1803,13 @@ define <16 x i8> @fptou_v16f64_v16i8(<16 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v7.2s, v7.2d
-; CHECK-SD-NEXT: xtn v6.2s, v6.2d
-; CHECK-SD-NEXT: xtn v5.2s, v5.2d
-; CHECK-SD-NEXT: xtn v4.2s, v4.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v1.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v16f64_v16i8:
@@ -1900,36 +1858,20 @@ define <32 x i8> @fptos_v32f64_v32i8(<32 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v18.2d, v18.2d
; CHECK-SD-NEXT: fcvtzs v17.2d, v17.2d
; CHECK-SD-NEXT: fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT: xtn v7.2s, v7.2d
-; CHECK-SD-NEXT: xtn v6.2s, v6.2d
-; CHECK-SD-NEXT: xtn v5.2s, v5.2d
-; CHECK-SD-NEXT: xtn v4.2s, v4.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: xtn v23.2s, v23.2d
-; CHECK-SD-NEXT: xtn v22.2s, v22.2d
-; CHECK-SD-NEXT: xtn v21.2s, v21.2d
-; CHECK-SD-NEXT: xtn v20.2s, v20.2d
-; CHECK-SD-NEXT: xtn v19.2s, v19.2d
-; CHECK-SD-NEXT: xtn v18.2s, v18.2d
-; CHECK-SD-NEXT: xtn v17.2s, v17.2d
-; CHECK-SD-NEXT: xtn v16.2s, v16.2d
-; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: uzp1 v1.4h, v22.4h, v23.4h
-; CHECK-SD-NEXT: uzp1 v3.4h, v20.4h, v21.4h
-; CHECK-SD-NEXT: uzp1 v5.4h, v18.4h, v19.4h
-; CHECK-SD-NEXT: uzp1 v7.4h, v16.4h, v17.4h
-; CHECK-SD-NEXT: mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT: mov v3.d[1], v1.d[0]
-; CHECK-SD-NEXT: mov v7.d[1], v5.d[0]
+; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v3.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT: uzp1 v1.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT: uzp1 v5.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT: uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT: uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT: uzp1 v2.8h, v7.8h, v5.8h
; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b
-; CHECK-SD-NEXT: uzp1 v1.16b, v7.16b, v3.16b
+; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v32f64_v32i8:
@@ -1997,36 +1939,20 @@ define <32 x i8> @fptou_v32f64_v32i8(<32 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v18.2d, v18.2d
; CHECK-SD-NEXT: fcvtzs v17.2d, v17.2d
; CHECK-SD-NEXT: fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT: xtn v7.2s, v7.2d
-; CHECK-SD-NEXT: xtn v6.2s, v6.2d
-; CHECK-SD-NEXT: xtn v5.2s, v5.2d
-; CHECK-SD-NEXT: xtn v4.2s, v4.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: xtn v23.2s, v23.2d
-; CHECK-SD-NEXT: xtn v22.2s, v22.2d
-; CHECK-SD-NEXT: xtn v21.2s, v21.2d
-; CHECK-SD-NEXT: xtn v20.2s, v20.2d
-; CHECK-SD-NEXT: xtn v19.2s, v19.2d
-; CHECK-SD-NEXT: xtn v18.2s, v18.2d
-; CHECK-SD-NEXT: xtn v17.2s, v17.2d
-; CHECK-SD-NEXT: xtn v16.2s, v16.2d
-; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: uzp1 v1.4h, v22.4h, v23.4h
-; CHECK-SD-NEXT: uzp1 v3.4h, v20.4h, v21.4h
-; CHECK-SD-NEXT: uzp1 v5.4h, v18.4h, v19.4h
-; CHECK-SD-NEXT: uzp1 v7.4h, v16.4h, v17.4h
-; CHECK-SD-NEXT: mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT: mov v3.d[1], v1.d[0]
-; CHECK-SD-NEXT: mov v7.d[1], v5.d[0]
+; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v3.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT: uzp1 v1.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT: uzp1 v5.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT: uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT: uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT: uzp1 v2.8h, v7.8h, v5.8h
; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b
-; CHECK-SD-NEXT: uzp1 v1.16b, v7.16b, v3.16b
+; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v32f64_v32i8:
@@ -3026,9 +2952,8 @@ define <8 x i8> @fptos_v8f32_v8i8(<8 x float> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT: xtn v1.4h, v1.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: xtn v0.8b, v0.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v8f32_v8i8:
@@ -3048,9 +2973,8 @@ define <8 x i8> @fptou_v8f32_v8i8(<8 x float> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT: xtn v1.4h, v1.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: xtn v0.8b, v0.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v8f32_v8i8:
@@ -3072,12 +2996,8 @@ define <16 x i8> @fptos_v16f32_v16i8(<16 x float> %a) {
; CHECK-SD-NEXT: fcvtzs v2.4s, v2.4s
; CHECK-SD-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT: xtn v3.4h, v3.4s
-; CHECK-SD-NEXT: xtn v2.4h, v2.4s
-; CHECK-SD-NEXT: xtn v1.4h, v1.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: ret
;
@@ -3134,20 +3054,12 @@ define <32 x i8> @fptos_v32f32_v32i8(<32 x float> %a) {
; CHECK-SD-NEXT: fcvtzs v6.4s, v6.4s
; CHECK-SD-NEXT: fcvtzs v5.4s, v5.4s
; CHECK-SD-NEXT: fcvtzs v4.4s, v4.4s
-; CHECK-SD-NEXT: xtn v3.4h, v3.4s
-; CHECK-SD-NEXT: xtn v2.4h, v2.4s
-; CHECK-SD-NEXT: xtn v1.4h, v1.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: xtn v7.4h, v7.4s
-; CHECK-SD-NEXT: xtn v6.4h, v6.4s
-; CHECK-SD-NEXT: xtn v5.4h, v5.4s
-; CHECK-SD-NEXT: xtn v4.4h, v4.4s
-; CHECK-SD-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT: mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT: mov v4.d[1], v5.d[0]
+; CHECK-SD-NEXT: uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: uzp1 v1.8h, v6.8h, v7.8h
+; CHECK-SD-NEXT: uzp1 v3.8h, v4.8h, v5.8h
; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-SD-NEXT: uzp1 v1.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT: uzp1 v1.16b, v3.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v32f32_v32i8:
diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
index b677d077..5d78ad2 100644
--- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll
+++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
@@ -104,7 +104,7 @@ define void @v4i32_v4i8(<4 x i32> %a, ptr %result) {
; CHECK-LABEL: v4i32_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
%b = trunc <4 x i32> %a to <4 x i8>
@@ -170,8 +170,7 @@ define void @v2i16_v2i8(<2 x i16> %a, ptr %result) {
define void @v4i16_v4i8(<4 x i16> %a, ptr %result) {
; CHECK-LABEL: v4i16_v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
%b = trunc <4 x i16> %a to <4 x i8>
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 5f905d9..6f1ae02 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -145,7 +145,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-NEXT: shl v0.4h, v0.4h, #8
; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
index 0ef6478..fb571ef 100644
--- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
+++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
@@ -353,13 +353,17 @@ define <8 x i8> @shuffle4_v8i8_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x
define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; CHECK-LABEL: shuffle4_v4i8_zext:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: uzp1 v1.8b, v2.8b, v3.8b
+; CHECK-NEXT: fmov d5, d2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: adrp x8, .LCPI8_0
-; CHECK-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-NEXT: fmov d4, d0
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-NEXT: mov v4.d[1], v1.d[0]
+; CHECK-NEXT: mov v5.d[1], v3.d[0]
+; CHECK-NEXT: bic v4.8h, #255, lsl #8
+; CHECK-NEXT: bic v5.8h, #255, lsl #8
+; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b }, v0.16b
; CHECK-NEXT: ret
%x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index d79f3ae..b1131f2 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -202,7 +202,7 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
; CHECK-SD-NEXT: ext v0.8b, v1.8b, v0.8b, #6
; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4
-; CHECK-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
@@ -390,7 +390,7 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
-; CHECK-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sme-write-vg.ll b/llvm/test/CodeGen/AArch64/sme-write-vg.ll
new file mode 100644
index 0000000..577606d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-write-vg.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mattr=+sme -stop-after=finalize-isel < %s | FileCheck %s
+
+target triple = "aarch64"
+
+; Check that we don't define VG for 'smstart za' and 'smstop za'
+define void @smstart_za() "aarch64_new_za" nounwind {
+ ; CHECK-LABEL: name: smstart_za
+ ; CHECK-NOT: implicit-def {{[^,]*}}$vg
+ ret void
+}
+
+; Check that we do define VG for 'smstart sm' and 'smstop sm'
+define void @smstart_sm() nounwind {
+ ; CHECK-LABEL: name: smstart_sm
+ ; CHECK: MSRpstatesvcrImm1 1, 1,
+ ; CHECK-SAME: implicit-def {{[^,]*}}$vg
+ ; CHECK: MSRpstatesvcrImm1 1, 0,
+ ; CHECK-SAME: implicit-def {{[^,]*}}$vg
+ call void @require_sm()
+ ret void
+}
+
+declare void @require_sm() "aarch64_pstate_sm_enabled"
+declare void @require_za() "aarch64_inout_za"
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index acec3e7..d1f843a 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -146,7 +146,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-NEXT: shl v0.4h, v0.4h, #8
; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h
; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 4f8a4f7..0ad9900 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -41,8 +41,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcvtzs v2.4s, v2.4s
; CHECK-NEXT: xtn v1.4h, v1.4s
; CHECK-NEXT: xtn v2.4h, v2.4s
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: xtn v2.8b, v2.8h
+; CHECK-NEXT: uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-NEXT: uzp1 v2.8b, v2.8b, v0.8b
; CHECK-NEXT: mov v1.s[1], v2.s[0]
; CHECK-NEXT: stur d1, [x12, #-4]
; CHECK-NEXT: add x12, x12, #8
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index ba367b0..18cd4cc 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -710,23 +710,23 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: LBB6_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldp q4, q1, [x0, #48]
-; CHECK-NEXT: add x9, x1, #8
-; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: subs x8, x8, #1
+; CHECK-NEXT: add x9, x1, #10
; CHECK-NEXT: ldr d0, [x0, #80]
+; CHECK-NEXT: ldp q3, q2, [x0]
; CHECK-NEXT: ldr q5, [x0, #32]
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: add x0, x0, #128
-; CHECK-NEXT: uzp1.4s v4, v5, v4
-; CHECK-NEXT: uzp1.4s v2, v3, v2
; CHECK-NEXT: uzp1.4s v0, v1, v0
-; CHECK-NEXT: uzp1.8h v1, v2, v4
+; CHECK-NEXT: uzp1.4s v1, v5, v4
+; CHECK-NEXT: uzp1.4s v2, v3, v2
; CHECK-NEXT: xtn.4h v0, v0
-; CHECK-NEXT: uzp1.16b v1, v1, v0
-; CHECK-NEXT: xtn.8b v0, v0
-; CHECK-NEXT: st1.h { v1 }[4], [x9]
-; CHECK-NEXT: add x9, x1, #10
-; CHECK-NEXT: st1.b { v0 }[2], [x9]
-; CHECK-NEXT: str d1, [x1], #16
+; CHECK-NEXT: uzp1.8h v1, v2, v1
+; CHECK-NEXT: uzp1.8b v2, v0, v0
+; CHECK-NEXT: uzp1.16b v0, v1, v0
+; CHECK-NEXT: st1.b { v2 }[2], [x9]
+; CHECK-NEXT: add x9, x1, #8
+; CHECK-NEXT: st1.h { v0 }[4], [x9]
+; CHECK-NEXT: str d0, [x1], #16
; CHECK-NEXT: b.eq LBB6_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -755,7 +755,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-BE-NEXT: xtn v0.4h, v0.4s
; CHECK-BE-NEXT: uzp1 v1.8h, v1.8h, v2.8h
; CHECK-BE-NEXT: uzp1 v1.16b, v1.16b, v0.16b
-; CHECK-BE-NEXT: xtn v0.8b, v0.8h
+; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-BE-NEXT: rev16 v2.16b, v1.16b
; CHECK-BE-NEXT: rev64 v1.16b, v1.16b
; CHECK-BE-NEXT: st1 { v0.b }[2], [x9]
@@ -790,7 +790,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-DISABLE-NEXT: xtn v0.4h, v0.4s
; CHECK-DISABLE-NEXT: uzp1 v1.8h, v1.8h, v2.8h
; CHECK-DISABLE-NEXT: uzp1 v1.16b, v1.16b, v0.16b
-; CHECK-DISABLE-NEXT: xtn v0.8b, v0.8h
+; CHECK-DISABLE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-DISABLE-NEXT: rev16 v2.16b, v1.16b
; CHECK-DISABLE-NEXT: rev64 v1.16b, v1.16b
; CHECK-DISABLE-NEXT: st1 { v0.b }[2], [x9]
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index e05c65d..f0bbed5 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -142,7 +142,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-NEXT: movi d0, #0xff00ff00ff00ff
; CHECK-NEXT: uaddl v1.8h, v1.8b, v2.8b
; CHECK-NEXT: umin v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 05f43e7..82c0327 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -143,7 +143,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
index 380bdbc..6119405 100644
--- a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
+++ b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -9,9 +9,8 @@ define <8 x i8> @float_to_i8(ptr %in) {
; CHECK-NEXT: fadd v0.4s, v0.4s, v0.4s
; CHECK-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-NEXT: fcvtzs v1.4s, v1.4s
-; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%l = load <8 x float>, ptr %in
%scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
index 9c6ab8d..dd7a9c6 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
@@ -210,7 +210,7 @@ define void @no_combine_for_non_bool_truncate(<4 x i32> %vec, ptr %out) {
; CHECK-LABEL: no_combine_for_non_bool_truncate:
; CHECK: ; %bb.0:
; CHECK-NEXT: xtn.4h v0, v0
-; CHECK-NEXT: xtn.8b v0, v0
+; CHECK-NEXT: uzp1.8b v0, v0, v0
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 90328f7..71d55df 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -410,7 +410,7 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
; BE-NEXT: ldrh w8, [x0, #4]
; BE-NEXT: rev32 v0.4h, v0.4h
; BE-NEXT: mov v0.h[2], w8
-; BE-NEXT: xtn v0.8b, v0.8h
+; BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; BE-NEXT: rev32 v0.16b, v0.16b
; BE-NEXT: str s0, [sp, #12]
; BE-NEXT: ldrh w9, [sp, #12]
@@ -456,7 +456,7 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
; BE-NEXT: add x8, x8, :lo12:.LCPI11_0
; BE-NEXT: ld1 { v1.4h }, [x8]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -638,7 +638,7 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -672,7 +672,7 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -706,7 +706,7 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -741,7 +741,7 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -777,7 +777,7 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -801,7 +801,7 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: shrn.4h v0, v0, #16
-; CHECK-NEXT: xtn.8b v1, v0
+; CHECK-NEXT: uzp1.8b v1, v0, v0
; CHECK-NEXT: umov.h w8, v0[2]
; CHECK-NEXT: str s1, [sp, #12]
; CHECK-NEXT: ldrh w9, [sp, #12]
@@ -816,7 +816,7 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -868,7 +868,7 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: ld1 { v0.b }[4], [x9]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #8]
@@ -921,7 +921,7 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: ld1 { v0.b }[4], [x9]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #8]
diff --git a/llvm/test/CodeGen/AArch64/xor.ll b/llvm/test/CodeGen/AArch64/xor.ll
index d92402c..7d7f7bf 100644
--- a/llvm/test/CodeGen/AArch64/xor.ll
+++ b/llvm/test/CodeGen/AArch64/xor.ll
@@ -51,7 +51,7 @@ define <4 x i32> @vec_add_of_not_decrement(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: vec_add_of_not_decrement:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%t0 = sub <4 x i32> %x, %y
%r = sub <4 x i32> %t0, <i32 1, i32 1, i32 1, i32 1>
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index ebb77c1..9865883 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -16968,7 +16968,7 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -16977,7 +16977,7 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -17163,9 +17163,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -17174,9 +17174,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, -1.0, v0
+; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -17280,8 +17280,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -17293,8 +17291,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -17375,10 +17371,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v1, v1, v3
; GCN-NEXT: v_min_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -17396,10 +17388,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -17522,12 +17510,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v2, v2, v5
; GCN-NEXT: v_min_f32_e32 v1, v1, v4
; GCN-NEXT: v_min_f32_e32 v0, v0, v3
@@ -17551,12 +17533,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
@@ -17688,14 +17664,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v3, v3, v7
; GCN-NEXT: v_min_f32_e32 v2, v2, v6
; GCN-NEXT: v_min_f32_e32 v1, v1, v5
@@ -17725,14 +17693,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
@@ -17951,22 +17911,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v7, v7, v15
; GCN-NEXT: v_min_f32_e32 v6, v6, v14
; GCN-NEXT: v_min_f32_e32 v5, v5, v13
@@ -18020,22 +17964,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v7, v7, v15
; GFX7-NEXT: v_min_f32_e32 v6, v6, v14
; GFX7-NEXT: v_min_f32_e32 v5, v5, v13
@@ -18382,71 +18310,51 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_min_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_min_f32_e32 v13, v13, v29
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_min_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_min_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_min_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_min_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_min_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_min_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_min_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_min_f32_e32 v5, v5, v21
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
@@ -18461,8 +18369,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_min_f32_e32 v4, v4, v20
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -18474,21 +18380,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v3, v3, v19
; GCN-NEXT: v_min_f32_e32 v2, v2, v18
; GCN-NEXT: v_min_f32_e32 v1, v1, v17
; GCN-NEXT: v_min_f32_e32 v0, v0, v16
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
@@ -18503,8 +18398,9 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_min_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -18513,14 +18409,12 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_minnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
@@ -18531,13 +18425,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -18560,13 +18454,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
@@ -18579,48 +18473,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_min_f32_e32 v15, v15, v25
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
; GFX7-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
; GFX7-NEXT: v_min_f32_e32 v5, v5, v21
; GFX7-NEXT: v_min_f32_e32 v4, v4, v20
; GFX7-NEXT: v_min_f32_e32 v3, v3, v19
@@ -18634,6 +18494,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
@@ -19267,287 +19131,223 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
; GCN-NEXT: v_min_f32_e32 v31, v31, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
; GCN-NEXT: v_min_f32_e32 v30, v30, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
; GCN-NEXT: v_min_f32_e32 v29, v29, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
; GCN-NEXT: v_min_f32_e32 v28, v28, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
; GCN-NEXT: v_min_f32_e32 v27, v27, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
; GCN-NEXT: v_min_f32_e32 v26, v26, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
; GCN-NEXT: v_min_f32_e32 v25, v25, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
; GCN-NEXT: v_min_f32_e32 v24, v24, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
; GCN-NEXT: v_min_f32_e32 v23, v23, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
; GCN-NEXT: v_min_f32_e32 v22, v22, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
; GCN-NEXT: v_min_f32_e32 v21, v21, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
; GCN-NEXT: v_min_f32_e32 v20, v20, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
; GCN-NEXT: v_min_f32_e32 v19, v19, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GCN-NEXT: v_min_f32_e32 v18, v18, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
; GCN-NEXT: v_min_f32_e32 v17, v17, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
; GCN-NEXT: v_min_f32_e32 v16, v16, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
; GCN-NEXT: v_min_f32_e32 v15, v15, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
; GCN-NEXT: v_min_f32_e32 v14, v14, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
; GCN-NEXT: v_min_f32_e32 v13, v13, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
; GCN-NEXT: v_min_f32_e32 v12, v12, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
; GCN-NEXT: v_min_f32_e32 v11, v11, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GCN-NEXT: v_min_f32_e32 v10, v10, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
; GCN-NEXT: v_min_f32_e32 v9, v9, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
; GCN-NEXT: v_min_f32_e32 v8, v8, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
; GCN-NEXT: v_min_f32_e32 v7, v7, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
; GCN-NEXT: v_min_f32_e32 v6, v6, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
; GCN-NEXT: v_min_f32_e32 v5, v5, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN-NEXT: v_min_f32_e32 v4, v4, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GCN-NEXT: v_min_f32_e32 v3, v3, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN-NEXT: v_min_f32_e32 v2, v2, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: v_min_f32_e32 v1, v1, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v0, v0, v32
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -19590,322 +19390,258 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v29, v29, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v0, v0, v32
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -21097,8 +20833,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -21110,8 +20844,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -21192,10 +20924,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v1, v1, v3
; GCN-NEXT: v_max_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -21213,10 +20941,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -21339,12 +21063,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v2, v2, v5
; GCN-NEXT: v_max_f32_e32 v1, v1, v4
; GCN-NEXT: v_max_f32_e32 v0, v0, v3
@@ -21368,12 +21086,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
@@ -21505,14 +21217,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v3, v3, v7
; GCN-NEXT: v_max_f32_e32 v2, v2, v6
; GCN-NEXT: v_max_f32_e32 v1, v1, v5
@@ -21542,14 +21246,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
@@ -21768,22 +21464,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v7, v7, v15
; GCN-NEXT: v_max_f32_e32 v6, v6, v14
; GCN-NEXT: v_max_f32_e32 v5, v5, v13
@@ -21837,22 +21517,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v7, v7, v15
; GFX7-NEXT: v_max_f32_e32 v6, v6, v14
; GFX7-NEXT: v_max_f32_e32 v5, v5, v13
@@ -22199,71 +21863,51 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_max_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_max_f32_e32 v13, v13, v29
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_max_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_max_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_max_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_max_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_max_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_max_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_max_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_max_f32_e32 v5, v5, v21
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
@@ -22278,8 +21922,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_max_f32_e32 v4, v4, v20
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -22291,21 +21933,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v3, v3, v19
; GCN-NEXT: v_max_f32_e32 v2, v2, v18
; GCN-NEXT: v_max_f32_e32 v1, v1, v17
; GCN-NEXT: v_max_f32_e32 v0, v0, v16
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
@@ -22320,8 +21951,9 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_max_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -22330,14 +21962,12 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_maxnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
@@ -22348,13 +21978,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -22377,13 +22007,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
@@ -22396,48 +22026,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_max_f32_e32 v15, v15, v25
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
; GFX7-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
; GFX7-NEXT: v_max_f32_e32 v5, v5, v21
; GFX7-NEXT: v_max_f32_e32 v4, v4, v20
; GFX7-NEXT: v_max_f32_e32 v3, v3, v19
@@ -22451,6 +22047,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
@@ -23084,287 +22684,223 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
; GCN-NEXT: v_max_f32_e32 v31, v31, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
; GCN-NEXT: v_max_f32_e32 v30, v30, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
; GCN-NEXT: v_max_f32_e32 v29, v29, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
; GCN-NEXT: v_max_f32_e32 v28, v28, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
; GCN-NEXT: v_max_f32_e32 v27, v27, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
; GCN-NEXT: v_max_f32_e32 v26, v26, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
; GCN-NEXT: v_max_f32_e32 v25, v25, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
; GCN-NEXT: v_max_f32_e32 v24, v24, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
; GCN-NEXT: v_max_f32_e32 v23, v23, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
; GCN-NEXT: v_max_f32_e32 v22, v22, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
; GCN-NEXT: v_max_f32_e32 v21, v21, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
; GCN-NEXT: v_max_f32_e32 v20, v20, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
; GCN-NEXT: v_max_f32_e32 v19, v19, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GCN-NEXT: v_max_f32_e32 v18, v18, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
; GCN-NEXT: v_max_f32_e32 v17, v17, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
; GCN-NEXT: v_max_f32_e32 v16, v16, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
; GCN-NEXT: v_max_f32_e32 v15, v15, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
; GCN-NEXT: v_max_f32_e32 v14, v14, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
; GCN-NEXT: v_max_f32_e32 v13, v13, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
; GCN-NEXT: v_max_f32_e32 v12, v12, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
; GCN-NEXT: v_max_f32_e32 v11, v11, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GCN-NEXT: v_max_f32_e32 v10, v10, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
; GCN-NEXT: v_max_f32_e32 v9, v9, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
; GCN-NEXT: v_max_f32_e32 v8, v8, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
; GCN-NEXT: v_max_f32_e32 v7, v7, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
; GCN-NEXT: v_max_f32_e32 v6, v6, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
; GCN-NEXT: v_max_f32_e32 v5, v5, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN-NEXT: v_max_f32_e32 v4, v4, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GCN-NEXT: v_max_f32_e32 v3, v3, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN-NEXT: v_max_f32_e32 v2, v2, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: v_max_f32_e32 v1, v1, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v0, v0, v32
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -23407,322 +22943,258 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v29, v29, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v0, v0, v32
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -25176,7 +24648,6 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26818,11 +26289,17 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GCN-LABEL: v_canonicalize_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_canonicalize_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_canonicalize_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index dfadd8d..9472845 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -2996,18 +2996,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
+; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
@@ -3095,16 +3093,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, 2.0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
+; GFX6-NEXT: v_max_f32_e32 v2, 2.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
@@ -3198,9 +3195,8 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_med3_f32 v2, v2, 0, 0
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3760,19 +3756,17 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_med3_f32 v3, v3, s2, 1.0
+; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
+; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3863,18 +3857,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
+; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 4ed1b8a..e198197 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -471,25 +471,15 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee
ret void
}

-; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
-; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; GCN-DENORM-NOT: v_max
-; GCN-DENORM-NOT: v_mul
-
-; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GCN-DENORM-NOT: v_max
-; GCN-DENORM-NOT: v_mul
-
-; GFX9: {{flat|global}}_store_dword
-define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 {
- %id = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
- %load = load float, ptr addrspace(1) %gep, align 4
- %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
- %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
- store float %canonicalized, ptr addrspace(1) %gep, align 4
- ret void
-}
+; define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 {
+; %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+; %load = load float, ptr addrspace(1) %gep, align 4
+; %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
+; %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+; store float %canonicalized, ptr addrspace(1) %gep, align 4
+; ret void
+; }

; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
@@ -523,32 +513,15 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1
ret void
}

-; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
-; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
-
-; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
-; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]
-
-; GFX9-FLUSH: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
-; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
-
-; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
-; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]
-
-; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]
-
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
-define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) {
- %id = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
- %load = load float, ptr addrspace(1) %gep, align 4
- %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
- %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
- store float %canonicalized, ptr addrspace(1) %gep, align 4
- ret void
-}
+; define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) {
+; %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+; %load = load float, ptr addrspace(1) %gep, align 4
+; %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
+; %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+; store float %canonicalized, ptr addrspace(1) %gep, align 4
+; ret void
+; }

; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
@@ -674,10 +647,9 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrsp
}

; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
-; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
+; GCN: {{flat|global}}_load_ushort [[V1:v[0-9]+]],
+; GCN: v_max_f16_e32 [[V2:v[0-9]+]], [[V1]], [[V1]]
+; GCN: {{flat|global}}_store_short v{{.+}}, [[V2]]
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
@@ -807,18 +779,13 @@ define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) {
ret half %canonicalized
}

-; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
-; GFX9: v_mul_f16_e32
-; GFX9: v_pk_mul_f16
-; GFX9-NOT: v_max
-; GFX9-NOT: v_pk_max
-define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
- %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
- %ins.op = fmul half %val, 8.0
- %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx
- %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
- ret <2 x half> %canonicalized
-}
+; define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
+; %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
+; %ins.op = fmul half %val, 8.0
+; %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx
+; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
+; ret <2 x half> %canonicalized
+; }

; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16:
; GFX9: v_mul_f16
@@ -842,15 +809,11 @@ define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x
ret <2 x half> %canonicalized
}

-; GCN-LABEL: {{^}}v_test_canonicalize_cvt_pkrtz:
-; GCN: s_waitcnt
-; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) {
- %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
- %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt)
- ret <2 x half> %canonicalized
-}
+; define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) {
+; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
+; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt)
+; ret <2 x half> %canonicalized
+; }

; GCN-LABEL: {{^}}v_test_canonicalize_cubeid:
; GCN: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 27462130..581b7b4 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -94,7 +94,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -147,7 +146,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -170,6 +168,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
ret void
}

+define half @s_test_canonicalize_arg(half %x) #1 {
+; VI-LABEL: s_test_canonicalize_arg:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_test_canonicalize_arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; CI-LABEL: s_test_canonicalize_arg:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_test_canonicalize_arg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %canonicalized = call half @llvm.canonicalize.f16(half %x)
+ ret half %canonicalized
+}
+
define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
; VI: ; %bb.0:
@@ -242,7 +269,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -299,7 +325,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -357,7 +382,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -414,7 +438,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -471,7 +494,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -1246,9 +1268,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1323,9 +1343,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1404,9 +1422,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1485,9 +1501,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1551,9 +1565,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
; CI-NEXT: v_cvt_f32_f16_e32 v1, s2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -2424,7 +2436,6 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16:
@@ -2456,8 +2467,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2738,7 +2748,6 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
@@ -2782,8 +2791,6 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
@@ -2826,13 +2833,10 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v2
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT: v_cvt_f32_f16_e32 v3, v2
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2878,18 +2882,18 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v6f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v6f16:
@@ -2933,22 +2937,22 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v8f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v8f16:
@@ -3001,30 +3005,30 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v12f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v12f16:
@@ -3087,38 +3091,38 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v16f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v16f16:
@@ -3216,68 +3220,68 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
-; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
-; CI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; CI-NEXT: v_cvt_f32_f16_e32 v30, v30
+; CI-NEXT: v_cvt_f32_f16_e32 v29, v29
+; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
-; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
-; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; CI-NEXT: v_cvt_f32_f16_e32 v29, v29
-; CI-NEXT: v_cvt_f32_f16_e32 v30, v30
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
@@ -3456,228 +3460,354 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v1, v1, v2
; CI-NEXT: v_cvt_f16_f32_e32 v2, v4
; CI-NEXT: v_cvt_f16_f32_e32 v4, v5
; CI-NEXT: v_cvt_f16_f32_e32 v5, v7
; CI-NEXT: v_cvt_f16_f32_e32 v7, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v6
; CI-NEXT: v_cvt_f16_f32_e32 v6, v10
; CI-NEXT: v_cvt_f16_f32_e32 v9, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v18
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v16
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v3, v4, v3
; CI-NEXT: v_cvt_f16_f32_e32 v4, v8
; CI-NEXT: v_cvt_f16_f32_e32 v8, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v21
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v26
+; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32
+; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v4, v5, v4
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; CI-NEXT: v_cvt_f16_f32_e32 v6, v12
; CI-NEXT: v_or_b32_e32 v5, v7, v5
; CI-NEXT: v_cvt_f16_f32_e32 v7, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v22
; CI-NEXT: v_or_b32_e32 v6, v7, v6
; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v19
; CI-NEXT: v_or_b32_e32 v7, v9, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v18
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124
+; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v29
-; CI-NEXT: v_or_b32_e32 v8, v9, v8
+; CI-NEXT: v_or_b32_e32 v8, v10, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v20
; CI-NEXT: v_or_b32_e32 v9, v11, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v19
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
-; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; CI-NEXT: v_or_b32_e32 v10, v11, v10
-; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v24
+; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v22
+; CI-NEXT: v_or_b32_e32 v10, v12, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v30
+; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; CI-NEXT: v_or_b32_e32 v11, v13, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v23
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
-; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24
-; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v30
-; CI-NEXT: v_or_b32_e32 v12, v13, v12
-; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
-; CI-NEXT: v_or_b32_e32 v13, v15, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v29
+; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; CI-NEXT: v_or_b32_e32 v12, v15, v12
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v31
+; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v17
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128
+; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132
+; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v15
; CI-NEXT: v_cvt_f16_f32_e32 v15, v27
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32
-; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44
-; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40
+; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT: s_waitcnt vmcnt(7)
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v33
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_or_b32_e32 v13, v16, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v32
+; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12
; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; CI-NEXT: v_or_b32_e32 v14, v15, v14
-; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v24
+; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v22
; CI-NEXT: v_or_b32_e32 v15, v25, v15
-; CI-NEXT: s_waitcnt vmcnt(11)
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: s_waitcnt vmcnt(10)
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v21
+; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96
+; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64
+; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v16
+; CI-NEXT: v_or_b32_e32 v16, v24, v25
+; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v27
+; CI-NEXT: v_or_b32_e32 v25, v28, v24
; CI-NEXT: s_waitcnt vmcnt(9)
; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
; CI-NEXT: s_waitcnt vmcnt(8)
; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; CI-NEXT: v_or_b32_e32 v16, v17, v16
-; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v18
-; CI-NEXT: v_or_b32_e32 v17, v19, v17
; CI-NEXT: s_waitcnt vmcnt(7)
-; CI-NEXT: v_cvt_f16_f32_e32 v18, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; CI-NEXT: v_or_b32_e32 v20, v19, v20
+; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8
+; CI-NEXT: s_waitcnt vmcnt(8)
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: s_waitcnt vmcnt(7)
+; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
; CI-NEXT: s_waitcnt vmcnt(6)
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v21
-; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v34
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; CI-NEXT: v_or_b32_e32 v17, v17, v26
+; CI-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0
+; CI-NEXT: v_or_b32_e32 v18, v27, v18
+; CI-NEXT: buffer_store_dword v17, v26, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0
+; CI-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x74, v0
+; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x70, v0
+; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen
+; CI-NEXT: s_waitcnt vmcnt(8)
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: v_cvt_f16_f32_e32 v20, v22
-; CI-NEXT: s_waitcnt vmcnt(4)
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v23
-; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; CI-NEXT: v_or_b32_e32 v18, v19, v18
-; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v20
-; CI-NEXT: v_or_b32_e32 v19, v21, v19
-; CI-NEXT: s_waitcnt vmcnt(3)
-; CI-NEXT: v_cvt_f16_f32_e32 v20, v26
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v27
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v28
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v29
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88
+; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84
+; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72
+; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT: s_waitcnt vmcnt(12)
+; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v29, v29
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; CI-NEXT: v_or_b32_e32 v20, v21, v20
-; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v26
-; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:48
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60
-; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
-; CI-NEXT: v_or_b32_e32 v21, v27, v21
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128
-; CI-NEXT: s_waitcnt vmcnt(5)
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: s_waitcnt vmcnt(4)
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_add_i32_e32 v21, vcc, 0x6c, v0
+; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen
+; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24
+; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT: s_waitcnt vmcnt(13)
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT: s_waitcnt vmcnt(12)
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v24
+; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:28
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16
+; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; CI-NEXT: v_or_b32_e32 v20, v23, v20
+; CI-NEXT: s_waitcnt vmcnt(9)
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: s_waitcnt vmcnt(8)
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v28
+; CI-NEXT: s_waitcnt vmcnt(7)
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: s_waitcnt vmcnt(4)
; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; CI-NEXT: v_or_b32_e32 v24, v25, v24
-; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x7c, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT: v_or_b32_e32 v22, v22, v23
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88
-; CI-NEXT: s_waitcnt vmcnt(2)
+; CI-NEXT: v_or_b32_e32 v23, v27, v23
+; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0
+; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
+; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
-; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_or_b32_e32 v17, v17, v18
+; CI-NEXT: v_add_i32_e32 v18, vcc, 0x64, v0
+; CI-NEXT: v_or_b32_e32 v25, v25, v26
+; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0
+; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x5c, v0
+; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_or_b32_e32 v19, v24, v19
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_or_b32_e32 v21, v22, v21
+; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40
+; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: s_waitcnt vmcnt(4)
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v22
+; CI-NEXT: v_or_b32_e32 v22, v23, v27
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52
+; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; CI-NEXT: v_or_b32_e32 v23, v28, v23
+; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48
+; CI-NEXT: s_waitcnt vmcnt(2)
; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x74, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v27
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92
-; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT: v_or_b32_e32 v25, v26, v25
-; CI-NEXT: v_add_i32_e32 v26, vcc, 0x70, v0
-; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96
-; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; CI-NEXT: v_or_b32_e32 v23, v23, v27
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_or_b32_e32 v24, v24, v27
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0
-; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT: v_or_b32_e32 v25, v26, v25
-; CI-NEXT: v_add_i32_e32 v26, vcc, 0x6c, v0
-; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64
-; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72
-; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84
-; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80
-; CI-NEXT: s_waitcnt vmcnt(3)
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT: v_or_b32_e32 v25, v26, v25
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; CI-NEXT: v_or_b32_e32 v27, v28, v27
+; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v29
-; CI-NEXT: v_or_b32_e32 v23, v26, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v28
-; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x64, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v26, vcc, 0x60, v0
-; CI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v23, vcc, 0x5c, v0
-; CI-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v23, vcc, 0x58, v0
-; CI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v22, vcc, 0x54, v0
-; CI-NEXT: buffer_store_dword v24, v22, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0
-; CI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v21, vcc, 0x4c, v0
-; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0
-; CI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0
-; CI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v18, vcc, 64, v0
-; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; CI-NEXT: v_or_b32_e32 v28, v29, v28
+; CI-NEXT: buffer_store_dword v28, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0
+; CI-NEXT: buffer_store_dword v27, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0
+; CI-NEXT: buffer_store_dword v24, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0
+; CI-NEXT: buffer_store_dword v23, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0
+; CI-NEXT: buffer_store_dword v22, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0
+; CI-NEXT: buffer_store_dword v21, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x44, v0
+; CI-NEXT: buffer_store_dword v19, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 64, v0
+; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen
; CI-NEXT: v_add_i32_e32 v17, vcc, 60, v0
; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index c1093a1..d53c041 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2389,7 +2389,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2471,15 +2470,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX6-NEXT: flat_load_dword v0, v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX6-NEXT: flat_store_dword v[0:1], v4
@@ -2724,7 +2721,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2807,15 +2803,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX6-NEXT: flat_load_dword v0, v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX6-NEXT: flat_store_dword v[0:1], v4
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 78fb89c..b32630a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -951,8 +951,6 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1056,7 +1054,6 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1110,7 +1107,6 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1193,7 +1189,6 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1222,7 +1217,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1253,7 +1247,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1311,7 +1304,6 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 0, v0
; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1346,7 +1338,6 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1413,8 +1404,6 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1494,8 +1483,6 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1599,7 +1586,6 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1653,7 +1639,6 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1736,7 +1721,6 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1792,7 +1776,6 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0, v0
; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1859,8 +1842,6 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3980,7 +3961,8 @@ define half @v_fneg_canonicalize_f16(half %a) #0 {
; SI-LABEL: v_fneg_canonicalize_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_canonicalize_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 17f6761..b5440b9 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -1021,7 +1021,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1043,7 +1042,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index ab7ab4d..d056a97 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -32,8 +32,6 @@ define amdgpu_kernel void @maxnum_f16(
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -170,7 +168,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -279,7 +276,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -384,21 +380,17 @@ define amdgpu_kernel void @maxnum_v2f16(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s2, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
-; SI-NEXT: s_lshr_b32 s0, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_max_f32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: s_lshr_b32 s3, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_max_f32_e32 v1, v2, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -497,20 +489,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, 4.0, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -589,20 +579,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_max_f32_e32 v1, 4.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -688,27 +676,21 @@ define amdgpu_kernel void @maxnum_v3f16(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
-; SI-NEXT: s_lshr_b32 s3, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: s_lshr_b32 s8, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s8
; SI-NEXT: v_cvt_f32_f16_e32 v3, s2
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_max_f32_e32 v2, v3, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, v1, v3
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_max_f32_e32 v0, v0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s1
+; SI-NEXT: v_max_f32_e32 v1, v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_max_f32_e32 v2, v3, v4
+; SI-NEXT: v_max_f32_e32 v0, v0, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -837,25 +819,17 @@ define amdgpu_kernel void @maxnum_v4f16(
; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
; SI-NEXT: s_lshr_b32 s6, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
; SI-NEXT: s_lshr_b32 s6, s5, 16
+; SI-NEXT: s_lshr_b32 s4, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
; SI-NEXT: v_cvt_f32_f16_e32 v1, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT: s_lshr_b32 s4, s4, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s5
; SI-NEXT: v_max_f32_e32 v3, v3, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, v1, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_max_f32_e32 v2, v2, v5
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_max_f32_e32 v2, v2, v7
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_max_f32_e32 v1, v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_max_f32_e32 v0, v0, v4
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -986,20 +960,16 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
; SI-NEXT: s_lshr_b32 s5, s5, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: s_lshr_b32 s4, s4, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_max_f32_e32 v2, 4.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0
+; SI-NEXT: v_max_f32_e32 v2, 4.0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_max_f32_e32 v3, 2.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index b7370ce..f934a2d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -32,8 +32,6 @@ define amdgpu_kernel void @minnum_f16_ieee(
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -197,7 +195,6 @@ define amdgpu_kernel void @minnum_f16_imm_a(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -305,7 +302,6 @@ define amdgpu_kernel void @minnum_f16_imm_b(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -409,21 +405,17 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s2, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
-; SI-NEXT: s_lshr_b32 s0, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_min_f32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: s_lshr_b32 s3, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_min_f32_e32 v1, v2, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -556,20 +548,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, 4.0, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -647,20 +637,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_min_f32_e32 v1, 4.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -745,27 +733,21 @@ define amdgpu_kernel void @minnum_v3f16(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
-; SI-NEXT: s_lshr_b32 s3, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: s_lshr_b32 s8, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s8
; SI-NEXT: v_cvt_f32_f16_e32 v3, s2
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_min_f32_e32 v2, v3, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, v1, v3
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_min_f32_e32 v0, v0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s1
+; SI-NEXT: v_min_f32_e32 v1, v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_min_f32_e32 v2, v3, v4
+; SI-NEXT: v_min_f32_e32 v0, v0, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -893,25 +875,17 @@ define amdgpu_kernel void @minnum_v4f16(
; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
; SI-NEXT: s_lshr_b32 s6, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
; SI-NEXT: s_lshr_b32 s6, s5, 16
+; SI-NEXT: s_lshr_b32 s4, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
; SI-NEXT: v_cvt_f32_f16_e32 v1, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT: s_lshr_b32 s4, s4, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s5
; SI-NEXT: v_min_f32_e32 v3, v3, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, v1, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_min_f32_e32 v2, v2, v5
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_min_f32_e32 v2, v2, v7
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_min_f32_e32 v1, v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_min_f32_e32 v0, v0, v4
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -1041,20 +1015,16 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
; SI-NEXT: s_lshr_b32 s5, s5, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: s_lshr_b32 s4, s4, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_min_f32_e32 v2, 4.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0
+; SI-NEXT: v_min_f32_e32 v2, 4.0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_min_f32_e32 v3, 2.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index fb3e79b..5b7f0e7 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -951,56 +951,70 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v1, 0
; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v6, 0
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v2, v0, 0
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v2, 1.0 op_sel_hi:[1,0]
; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, 0
+; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, 0
+; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SDAG-VI-NEXT: v_mac_f32_e32 v8, v6, v7
; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2
-; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp
; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v8
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v5
+; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v1
+; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v2
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v3
+; SDAG-VI-NEXT: v_min_f16_e32 v1, 1.0, v1
; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1139,63 +1153,80 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
}
define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v7, 0
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mov_b32_e32 v0, v6
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX900: ; %bb.0:
+; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v7, 0
+; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_mov_b32_e32 v0, v6
-; GFX906-NEXT: v_mov_b32_e32 v1, v2
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX906: ; %bb.0:
+; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v7, 0
+; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SDAG-VI-NEXT: v_mac_f32_e32 v10, v7, v9
; SDAG-VI-NEXT: v_mac_f32_e32 v11, v6, v8
-; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3
; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2
-; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v11 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v1, v10 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp
+; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v10
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v11
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v4
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v3, v5
+; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v1
+; SDAG-VI-NEXT: v_max_f16_e32 v2, 0, v2
+; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v3
+; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0x3c00
+; SDAG-VI-NEXT: v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_f16_e32 v3, 1.0, v3
+; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v2
; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0
; SDAG-VI-NEXT: v_or_b32_e32 v1, v3, v1
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
@@ -1241,6 +1272,40 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX900: ; %bb.0:
+; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v2
+; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX906: ; %bb.0:
+; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, v2
+; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/Hexagon/addrmode-immop.mir b/llvm/test/CodeGen/Hexagon/addrmode-immop.mir
index 3069cbe..1412d31 100644
--- a/llvm/test/CodeGen/Hexagon/addrmode-immop.mir
+++ b/llvm/test/CodeGen/Hexagon/addrmode-immop.mir
@@ -15,7 +15,7 @@
; Function Attrs: norecurse
define void @f0() #0 {
b0:
- %v0 = load ptr, ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, inrange i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1)), align 4
+ %v0 = load ptr, ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1)), align 4
%v1 = call i32 %v0(ptr nonnull undef)
unreachable
}
@@ -33,7 +33,7 @@ tracksRegLiveness: true
body: |
bb.0.b0:
$r2 = A2_tfrsi @g0 + 12
- $r2 = L2_loadri_io killed $r2, @f1 - 1 :: (load (s32) from `ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, inrange i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1))`)
+ $r2 = L2_loadri_io killed $r2, @f1 - 1 :: (load (s32) from `ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1))`)
ADJCALLSTACKDOWN 0, 0, implicit-def $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
PS_callr_nr killed $r2, hexagoncsr, implicit undef $r0, implicit-def $r29, implicit-def dead $r0
ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
diff --git a/llvm/test/CodeGen/NVPTX/b52037.ll b/llvm/test/CodeGen/NVPTX/b52037.ll
index d9322da..5d1c390 100644
--- a/llvm/test/CodeGen/NVPTX/b52037.ll
+++ b/llvm/test/CodeGen/NVPTX/b52037.ll
@@ -47,7 +47,7 @@ bb:
%tmp5 = load ptr, ptr %tmp4, align 8
%tmp9 = getelementptr inbounds %struct.zot, ptr %tmp, i64 0, i32 2, i32 1
store ptr %tmp5, ptr %tmp9, align 8
- store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @global_1, i64 0, inrange i32 0, i64 3), ptr %tmp, align 16
+ store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @global_1, i64 0, i32 0, i64 3), ptr %tmp, align 16
%tmp.i1 = tail call i64 @foo()
%tmp44.i16 = getelementptr inbounds i16, ptr %tmp5, i64 undef
%tmp45.i17 = load i16, ptr %tmp44.i16, align 2
diff --git a/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir b/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir
index 3a312d2..f3ef95b 100644
--- a/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir
+++ b/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir
@@ -130,7 +130,7 @@ body: |
%22:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @c
%10:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @e
%13:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a
- %14:g8rc_and_g8rc_nox0 = ADDItocL killed %13, @a, implicit $x2
+ %14:g8rc_and_g8rc_nox0 = ADDItocL8 killed %13, @a, implicit $x2
bb.2.while.body:
successors: %bb.4(0x30000000), %bb.3(0x50000000)
diff --git a/llvm/test/CodeGen/PowerPC/toc-data-large-array.ll b/llvm/test/CodeGen/PowerPC/toc-data-large-array.ll
new file mode 100644
index 0000000..90f40d9
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/toc-data-large-array.ll
@@ -0,0 +1,16 @@
+; RUN: not --crash llc -mtriple powerpc-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR
+; RUN: not --crash llc -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR
+
+@a = global [5 x i16] zeroinitializer, align 2 #0
+
+; Function Attrs: noinline
+define i16 @foo() #1 {
+entry:
+ %0 = load i16, ptr @a, align 2
+ ret i16 %0
+}
+
+attributes #0 = { "toc-data" }
+attributes #1 = { noinline }
+
+; CHECK-ERROR: LLVM ERROR: A GlobalVariable with size larger than a TOC entry is not currently supported by the toc data transformation.
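+
+; Editorial note (an annotation, not part of the original test): @a is
+; 5 x i16 = 10 bytes, which exceeds a pointer-sized TOC entry (4 bytes on
+; AIX32, 8 bytes on AIX64), so llc is expected to abort with the diagnostic
+; checked above rather than place the data directly in the TOC.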
diff --git a/llvm/test/CodeGen/PowerPC/toc-data-large-array2.ll b/llvm/test/CodeGen/PowerPC/toc-data-large-array2.ll
new file mode 100644
index 0000000..f870e99
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/toc-data-large-array2.ll
@@ -0,0 +1,8 @@
+; RUN: not --crash llc -mtriple powerpc-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR
+; RUN: not --crash llc -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR
+
+@a = global [5 x i16] zeroinitializer, align 2 #0
+
+attributes #0 = { "toc-data" }
+
+; CHECK-ERROR: LLVM ERROR: A GlobalVariable with size larger than a TOC entry is not currently supported by the toc data transformation.
diff --git a/llvm/test/CodeGen/PowerPC/toc-data-struct-array.ll b/llvm/test/CodeGen/PowerPC/toc-data-struct-array.ll
new file mode 100644
index 0000000..a5c9a8b
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/toc-data-struct-array.ll
@@ -0,0 +1,110 @@
+; RUN: llc -mtriple powerpc-ibm-aix-xcoff < %s | FileCheck %s --check-prefix CHECK
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff < %s | FileCheck %s --check-prefix CHECK
+
+; RUN: llc -filetype=obj -mtriple powerpc-ibm-aix-xcoff < %s -o %t32.o
+; RUN: llvm-readobj %t32.o --syms | FileCheck %s --check-prefix=OBJ32
+; RUN: llc -filetype=obj -mtriple powerpc64-ibm-aix-xcoff < %s -o %t64.o
+; RUN: llvm-readobj %t64.o --syms | FileCheck %s --check-prefix=OBJ64
+
+%struct.small_struct = type { i16 }
+
+@a = global %struct.small_struct zeroinitializer, align 2 #0
+@b = global [2 x i16] zeroinitializer, align 2 #0
+
+; Function Attrs: noinline
+define i16 @foo() #1 {
+entry:
+ %0 = load i16, ptr @a, align 2
+ %1 = load i16, ptr @b, align 2
+ %add = add nsw i16 %0, %1
+ ret i16 %add
+}
+
+attributes #0 = { "toc-data" }
+attributes #1 = { noinline }
+
+; CHECK: .toc
+; CHECK-NEXT: .csect a[TD],2
+; CHECK-NEXT: .globl a[TD] # @a
+; CHECK-NEXT: .align 1
+; CHECK-NEXT: .space 2
+; CHECK-NEXT: .csect b[TD],2
+; CHECK-NEXT: .globl b[TD] # @b
+; CHECK-NEXT: .align 1
+; CHECK-NEXT: .space 4
+
+; OBJ32: Symbol {
+; OBJ32: Name: a
+; OBJ32-NEXT: Value (RelocatableAddress): 0x3C
+; OBJ32-NEXT: Section: .data
+; OBJ32-NEXT: Type: 0x0
+; OBJ32-NEXT: StorageClass: C_EXT (0x2)
+; OBJ32-NEXT: NumberOfAuxEntries: 1
+; OBJ32-NEXT: CSECT Auxiliary Entry {
+; OBJ32-NEXT: Index: {{[0-9]+}}
+; OBJ32-NEXT: SectionLen: 2
+; OBJ32-NEXT: ParameterHashIndex: 0x0
+; OBJ32-NEXT: TypeChkSectNum: 0x0
+; OBJ32-NEXT: SymbolAlignmentLog2: 2
+; OBJ32-NEXT: SymbolType: XTY_SD (0x1)
+; OBJ32-NEXT: StorageMappingClass: XMC_TD (0x10)
+; OBJ32-NEXT: StabInfoIndex: 0x0
+; OBJ32-NEXT: StabSectNum: 0x0
+; OBJ32-NEXT: }
+; OBJ32-NEXT: }
+; OBJ32-NEXT: Symbol {
+; OBJ32: Name: b
+; OBJ32-NEXT: Value (RelocatableAddress): 0x40
+; OBJ32-NEXT: Section: .data
+; OBJ32-NEXT: Type: 0x0
+; OBJ32-NEXT: StorageClass: C_EXT (0x2)
+; OBJ32-NEXT: NumberOfAuxEntries: 1
+; OBJ32-NEXT: CSECT Auxiliary Entry {
+; OBJ32-NEXT: Index: {{[0-9]+}}
+; OBJ32-NEXT: SectionLen: 4
+; OBJ32-NEXT: ParameterHashIndex: 0x0
+; OBJ32-NEXT: TypeChkSectNum: 0x0
+; OBJ32-NEXT: SymbolAlignmentLog2: 2
+; OBJ32-NEXT: SymbolType: XTY_SD (0x1)
+; OBJ32-NEXT: StorageMappingClass: XMC_TD (0x10)
+; OBJ32-NEXT: StabInfoIndex: 0x0
+; OBJ32-NEXT: StabSectNum: 0x0
+; OBJ32-NEXT: }
+; OBJ32-NEXT: }
+
+; OBJ64: Symbol {
+; OBJ64: Name: a
+; OBJ64-NEXT: Value (RelocatableAddress): 0x48
+; OBJ64-NEXT: Section: .data
+; OBJ64-NEXT: Type: 0x0
+; OBJ64-NEXT: StorageClass: C_EXT (0x2)
+; OBJ64-NEXT: NumberOfAuxEntries: 1
+; OBJ64-NEXT: CSECT Auxiliary Entry {
+; OBJ64-NEXT: Index: {{[0-9]+}}
+; OBJ64-NEXT: SectionLen: 2
+; OBJ64-NEXT: ParameterHashIndex: 0x0
+; OBJ64-NEXT: TypeChkSectNum: 0x0
+; OBJ64-NEXT: SymbolAlignmentLog2: 2
+; OBJ64-NEXT: SymbolType: XTY_SD (0x1)
+; OBJ64-NEXT: StorageMappingClass: XMC_TD (0x10)
+; OBJ64-NEXT: Auxiliary Type: AUX_CSECT (0xFB)
+; OBJ64-NEXT: }
+; OBJ64-NEXT: }
+; OBJ64-NEXT: Symbol {
+; OBJ64: Name: b
+; OBJ64-NEXT: Value (RelocatableAddress): 0x4C
+; OBJ64-NEXT: Section: .data
+; OBJ64-NEXT: Type: 0x0
+; OBJ64-NEXT: StorageClass: C_EXT (0x2)
+; OBJ64-NEXT: NumberOfAuxEntries: 1
+; OBJ64-NEXT: CSECT Auxiliary Entry {
+; OBJ64-NEXT: Index: {{[0-9]+}}
+; OBJ64-NEXT: SectionLen: 4
+; OBJ64-NEXT: ParameterHashIndex: 0x0
+; OBJ64-NEXT: TypeChkSectNum: 0x0
+; OBJ64-NEXT: SymbolAlignmentLog2: 2
+; OBJ64-NEXT: SymbolType: XTY_SD (0x1)
+; OBJ64-NEXT: StorageMappingClass: XMC_TD (0x10)
+; OBJ64-NEXT: Auxiliary Type: AUX_CSECT (0xFB)
+; OBJ64-NEXT: }
+; OBJ64-NEXT: }
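+
+; Editorial note (not part of the original test): both globals fit in a TOC
+; entry, so each is emitted as its own a[TD]/b[TD] csect with storage mapping
+; class XMC_TD, and the SectionLen values above (2 and 4) match the sizes of
+; %struct.small_struct and [2 x i16] respectively.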
diff --git a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
new file mode 100644
index 0000000..673008d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
@@ -0,0 +1,871 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs -mtriple=riscv64 -mattr=+v,+d,+m,+zbb %s -o - | FileCheck %s --check-prefix=RV64
+; RUN: llc -verify-machineinstrs -mtriple=riscv32 -mattr=+v,+d,+m,+zbb %s -o - | FileCheck %s --check-prefix=RV32
+
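+; Editorial note (not autogenerated): each case below is expected to follow
+; the same lowering pattern: vcompress.vm packs the selected elements to the
+; front of a destination register group, vcpop.m counts the set mask bits,
+; and that count becomes the AVL of a vsetvli that bounds the final
+; unit-stride store (vse8/vse16/vse32/vse64).
+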
+; Compress + store for i8 type
+
+define void @test_compresstore_v1i8(ptr %p, <1 x i1> %mask, <1 x i8> %data) {
+; RV64-LABEL: test_compresstore_v1i8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vse8.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v1i8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vse8.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v1i8(<1 x i8> %data, ptr align 1 %p, <1 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v2i8(ptr %p, <2 x i1> %mask, <2 x i8> %data) {
+; RV64-LABEL: test_compresstore_v2i8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vse8.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v2i8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vse8.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v2i8(<2 x i8> %data, ptr align 1 %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v4i8(ptr %p, <4 x i1> %mask, <4 x i8> %data) {
+; RV64-LABEL: test_compresstore_v4i8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vse8.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v4i8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vse8.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v4i8(<4 x i8> %data, ptr align 1 %p, <4 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v8i8(ptr %p, <8 x i1> %mask, <8 x i8> %data) {
+; RV64-LABEL: test_compresstore_v8i8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vse8.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v8i8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vse8.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v8i8(<8 x i8> %data, ptr align 1 %p, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v16i8(ptr %p, <16 x i1> %mask, <16 x i8> %data) {
+; RV64-LABEL: test_compresstore_v16i8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vse8.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v16i8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vse8.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v16i8(<16 x i8> %data, ptr align 1 %p, <16 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v32i8(ptr %p, <32 x i1> %mask, <32 x i8> %data) {
+; RV64-LABEL: test_compresstore_v32i8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV64-NEXT: vcompress.vm v10, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV64-NEXT: vse8.v v10, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v32i8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV32-NEXT: vcompress.vm v10, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV32-NEXT: vse8.v v10, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v32i8(<32 x i8> %data, ptr align 1 %p, <32 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v64i8(ptr %p, <64 x i1> %mask, <64 x i8> %data) {
+; RV64-LABEL: test_compresstore_v64i8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 64
+; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV64-NEXT: vcompress.vm v12, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV64-NEXT: vse8.v v12, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v64i8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 64
+; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV32-NEXT: vcompress.vm v12, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV32-NEXT: vse8.v v12, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v64i8(<64 x i8> %data, ptr align 1 %p, <64 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v128i8(ptr %p, <128 x i1> %mask, <128 x i8> %data) {
+; RV64-LABEL: test_compresstore_v128i8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV64-NEXT: vcompress.vm v16, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV64-NEXT: vse8.v v16, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v128i8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 128
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT: vcompress.vm v16, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT: vse8.v v16, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v128i8(<128 x i8> %data, ptr align 1 %p, <128 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v256i8(ptr %p, <256 x i1> %mask, <256 x i8> %data) {
+; RV64-LABEL: test_compresstore_v256i8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vmv1r.v v7, v8
+; RV64-NEXT: li a2, 128
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vle8.v v24, (a1)
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v0, 1
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vmv.x.s a3, v0
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vcompress.vm v8, v16, v0
+; RV64-NEXT: vcpop.m a4, v0
+; RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma
+; RV64-NEXT: vse8.v v8, (a0)
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vcompress.vm v8, v24, v7
+; RV64-NEXT: vcpop.m a2, v7
+; RV64-NEXT: cpop a3, a3
+; RV64-NEXT: cpop a1, a1
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vse8.v v8, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v256i8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vmv1r.v v7, v8
+; RV32-NEXT: li a2, 128
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT: vle8.v v24, (a1)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v0, 1
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsrl.vx v10, v9, a1
+; RV32-NEXT: vmv.x.s a3, v10
+; RV32-NEXT: vsrl.vx v10, v0, a1
+; RV32-NEXT: vmv.x.s a1, v10
+; RV32-NEXT: vmv.x.s a4, v9
+; RV32-NEXT: vmv.x.s a5, v0
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT: vcompress.vm v8, v16, v0
+; RV32-NEXT: vcpop.m a6, v0
+; RV32-NEXT: vsetvli zero, a6, e8, m8, ta, ma
+; RV32-NEXT: vse8.v v8, (a0)
+; RV32-NEXT: cpop a1, a1
+; RV32-NEXT: cpop a5, a5
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: cpop a3, a3
+; RV32-NEXT: cpop a4, a4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT: vcompress.vm v8, v24, v7
+; RV32-NEXT: vcpop.m a1, v7
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT: vse8.v v8, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v256i8(<256 x i8> %data, ptr align 1 %p, <256 x i1> %mask)
+ ret void
+}
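+
+; Editorial note (not autogenerated): in the v256i8 case above the mask no
+; longer fits in a single 64-bit element, so the store is split in two. The
+; scalar cpop instructions count the active bits of the low mask half, and
+; that popcount (times 1 byte per i8 element) is added to the base pointer
+; to form the address of the second compressed store.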
+
+; Compress + store for i16 type
+
+define void @test_compresstore_v1i16(ptr %p, <1 x i1> %mask, <1 x i16> %data) {
+; RV64-LABEL: test_compresstore_v1i16:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v1i16:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v1i16(<1 x i16> %data, ptr align 2 %p, <1 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v2i16(ptr %p, <2 x i1> %mask, <2 x i16> %data) {
+; RV64-LABEL: test_compresstore_v2i16:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v2i16:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v2i16(<2 x i16> %data, ptr align 2 %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v4i16(ptr %p, <4 x i1> %mask, <4 x i16> %data) {
+; RV64-LABEL: test_compresstore_v4i16:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v4i16:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v4i16(<4 x i16> %data, ptr align 2 %p, <4 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v8i16(ptr %p, <8 x i1> %mask, <8 x i16> %data) {
+; RV64-LABEL: test_compresstore_v8i16:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v8i16:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v8i16(<8 x i16> %data, ptr align 2 %p, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v16i16(ptr %p, <16 x i1> %mask, <16 x i16> %data) {
+; RV64-LABEL: test_compresstore_v16i16:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; RV64-NEXT: vcompress.vm v10, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT: vse16.v v10, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v16i16:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; RV32-NEXT: vcompress.vm v10, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vse16.v v10, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v16i16(<16 x i16> %data, ptr align 2 %p, <16 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v32i16(ptr %p, <32 x i1> %mask, <32 x i16> %data) {
+; RV64-LABEL: test_compresstore_v32i16:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; RV64-NEXT: vcompress.vm v12, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; RV64-NEXT: vse16.v v12, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v32i16:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; RV32-NEXT: vcompress.vm v12, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; RV32-NEXT: vse16.v v12, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v32i16(<32 x i16> %data, ptr align 2 %p, <32 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v64i16(ptr %p, <64 x i1> %mask, <64 x i16> %data) {
+; RV64-LABEL: test_compresstore_v64i16:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 64
+; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; RV64-NEXT: vcompress.vm v16, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; RV64-NEXT: vse16.v v16, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v64i16:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 64
+; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT: vcompress.vm v16, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT: vse16.v v16, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v64i16(<64 x i16> %data, ptr align 2 %p, <64 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v128i16(ptr %p, <128 x i1> %mask, <128 x i16> %data) {
+; RV64-LABEL: test_compresstore_v128i16:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 64
+; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; RV64-NEXT: vcompress.vm v24, v8, v0
+; RV64-NEXT: vcpop.m a2, v0
+; RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma
+; RV64-NEXT: vse16.v v24, (a0)
+; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v0, 8
+; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; RV64-NEXT: vcompress.vm v24, v16, v8
+; RV64-NEXT: vcpop.m a2, v8
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: cpop a1, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma
+; RV64-NEXT: vse16.v v24, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v128i16:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 64
+; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT: vcompress.vm v24, v8, v0
+; RV32-NEXT: vcpop.m a2, v0
+; RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma
+; RV32-NEXT: vse16.v v24, (a0)
+; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v0, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT: vcompress.vm v8, v16, v24
+; RV32-NEXT: vcpop.m a1, v24
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vx v16, v0, a2
+; RV32-NEXT: vmv.x.s a2, v16
+; RV32-NEXT: cpop a2, a2
+; RV32-NEXT: vmv.x.s a3, v0
+; RV32-NEXT: cpop a3, a3
+; RV32-NEXT: add a2, a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v128i16(<128 x i16> %data, ptr align 2 %p, <128 x i1> %mask)
+ ret void
+}
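+
+; Editorial note (not autogenerated): for wider element types the popcount
+; of the low mask half is scaled to a byte offset before being added to the
+; base pointer, hence the slli by 1 here for i16 and, in the cases below,
+; by 2 for i32 and by 3 for i64.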
+
+; Compress + store for i32 type
+
+define void @test_compresstore_v1i32(ptr %p, <1 x i1> %mask, <1 x i32> %data) {
+; RV64-LABEL: test_compresstore_v1i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vse32.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v1i32:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vse32.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v1i32(<1 x i32> %data, ptr align 4 %p, <1 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v2i32(ptr %p, <2 x i1> %mask, <2 x i32> %data) {
+; RV64-LABEL: test_compresstore_v2i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vse32.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v2i32:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vse32.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v2i32(<2 x i32> %data, ptr align 4 %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v4i32(ptr %p, <4 x i1> %mask, <4 x i32> %data) {
+; RV64-LABEL: test_compresstore_v4i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vse32.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v4i32:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vse32.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %data, ptr align 4 %p, <4 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v8i32(ptr %p, <8 x i1> %mask, <8 x i32> %data) {
+; RV64-LABEL: test_compresstore_v8i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vcompress.vm v10, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; RV64-NEXT: vse32.v v10, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v8i32:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vcompress.vm v10, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; RV32-NEXT: vse32.v v10, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %data, ptr align 4 %p, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v16i32(ptr %p, <16 x i1> %mask, <16 x i32> %data) {
+; RV64-LABEL: test_compresstore_v16i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vcompress.vm v12, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v12, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v16i32:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vcompress.vm v12, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v12, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %data, ptr align 4 %p, <16 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v32i32(ptr %p, <32 x i1> %mask, <32 x i32> %data) {
+; RV64-LABEL: test_compresstore_v32i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vcompress.vm v16, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vse32.v v16, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v32i32:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vcompress.vm v16, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vse32.v v16, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v32i32(<32 x i32> %data, ptr align 4 %p, <32 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data) {
+; RV64-LABEL: test_compresstore_v64i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vcompress.vm v24, v8, v0
+; RV64-NEXT: vcpop.m a2, v0
+; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV64-NEXT: vse32.v v24, (a0)
+; RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v0, 4
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vcompress.vm v24, v16, v8
+; RV64-NEXT: vcpop.m a1, v8
+; RV64-NEXT: vmv.x.s a2, v0
+; RV64-NEXT: cpopw a2, a2
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vse32.v v24, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v64i32:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vcompress.vm v24, v8, v0
+; RV32-NEXT: vcpop.m a2, v0
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vse32.v v24, (a0)
+; RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v0, 4
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vcompress.vm v24, v16, v8
+; RV32-NEXT: vcpop.m a1, v8
+; RV32-NEXT: vmv.x.s a2, v0
+; RV32-NEXT: cpop a2, a2
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vse32.v v24, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v64i32(<64 x i32> %data, ptr align 4 %p, <64 x i1> %mask)
+ ret void
+}
+
+; Compress + store for i64 type
+
+define void @test_compresstore_v1i64(ptr %p, <1 x i1> %mask, <1 x i64> %data) {
+; RV64-LABEL: test_compresstore_v1i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vse64.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v1i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT: vse64.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v1i64(<1 x i64> %data, ptr align 8 %p, <1 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %data) {
+; RV64-LABEL: test_compresstore_v2i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vse64.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v2i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT: vse64.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %data, ptr align 8 %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v4i64(ptr %p, <4 x i1> %mask, <4 x i64> %data) {
+; RV64-LABEL: test_compresstore_v4i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vcompress.vm v10, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vse64.v v10, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v4i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vcompress.vm v10, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV32-NEXT: vse64.v v10, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %data, ptr align 8 %p, <4 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v8i64(ptr %p, <8 x i1> %mask, <8 x i64> %data) {
+; RV64-LABEL: test_compresstore_v8i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT: vcompress.vm v12, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vse64.v v12, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v8i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vcompress.vm v12, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV32-NEXT: vse64.v v12, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %data, ptr align 8 %p, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v16i64(ptr %p, <16 x i1> %mask, <16 x i64> %data) {
+; RV64-LABEL: test_compresstore_v16i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vcompress.vm v16, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vse64.v v16, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v16i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vcompress.vm v16, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vse64.v v16, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v16i64(<16 x i64> %data, ptr align 8 %p, <16 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_v32i64(ptr %p, <32 x i1> %mask, <32 x i64> %data) {
+; RV64-LABEL: test_compresstore_v32i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vcompress.vm v24, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vse64.v v24, (a0)
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vcompress.vm v8, v16, v24
+; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: zext.h a1, a1
+; RV64-NEXT: cpopw a1, a1
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vcpop.m a1, v24
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vse64.v v8, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_v32i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vcompress.vm v24, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vse64.v v24, (a0)
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v0, 2
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vcompress.vm v8, v16, v24
+; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; RV32-NEXT: vmv.x.s a1, v0
+; RV32-NEXT: zext.h a1, a1
+; RV32-NEXT: cpop a1, a1
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vcpop.m a1, v24
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vse64.v v8, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v32i64(<32 x i64> %data, ptr align 8 %p, <32 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v1i8(<1 x i8>, ptr, <1 x i1>)
+declare void @llvm.masked.compressstore.v2i8(<2 x i8>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i8(<4 x i8>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v128i8(<128 x i8>, ptr, <128 x i1>)
+declare void @llvm.masked.compressstore.v256i8(<256 x i8>, ptr, <256 x i1>)
+
+declare void @llvm.masked.compressstore.v1i16(<1 x i16>, ptr, <1 x i1>)
+declare void @llvm.masked.compressstore.v2i16(<2 x i16>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v64i16(<64 x i16>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v128i16(<128 x i16>, ptr, <128 x i1>)
+
+declare void @llvm.masked.compressstore.v1i32(<1 x i32>, ptr, <1 x i1>)
+declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v32i32(<32 x i32>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v64i32(<64 x i32>, ptr, <64 x i1>)
+
+declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i64(<16 x i64>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v32i64(<32 x i64>, ptr, <32 x i1>)
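Every regenerated test in the file above, and in the files that follow, reflects the same lowering change: instead of scalarizing llvm.masked.compressstore into roughly two basic blocks per mask bit (a test, then a vslidedown plus scalar-width store), the backend now emits one straight-line vcompress sequence. A minimal hand-written sketch of that pattern in RISC-V vector assembly; the register assignments and the e32/m1 configuration are illustrative, not taken from any one test above:

    # a0 = destination pointer, v8 = data, v0 = mask (illustrative assignments)
    vsetivli zero, 4, e32, m1, ta, ma   # VL = full vector length of the source type
    vcompress.vm v9, v8, v0             # pack the elements selected by v0 to the front of v9
    vcpop.m a1, v0                      # a1 = number of set bits in the mask
    vsetvli zero, a1, e32, m1, ta, ma   # shrink VL to exactly the element count to store
    vse32.v v9, (a0)                    # one unit-stride store of the compressed data
    ret

The element width and LMUL vary per test, but the five-instruction shape is identical throughout.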
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll
index 52c5292..36fbdd8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll
@@ -6,24 +6,20 @@ declare void @llvm.masked.compressstore.v1f16(<1 x half>, ptr, <1 x i1>)
define void @compressstore_v1f16(ptr %base, <1 x half> %v, <1 x i1> %mask) {
; RV32-LABEL: compressstore_v1f16:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT: vfirst.m a1, v0
-; RV32-NEXT: bnez a1, .LBB0_2
-; RV32-NEXT: # %bb.1: # %cond.store
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT: vse16.v v8, (a0)
-; RV32-NEXT: .LBB0_2: # %else
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vse16.v v9, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v1f16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT: vfirst.m a1, v0
-; RV64-NEXT: bnez a1, .LBB0_2
-; RV64-NEXT: # %bb.1: # %cond.store
; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT: vse16.v v8, (a0)
-; RV64-NEXT: .LBB0_2: # %else
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vse16.v v9, (a0)
; RV64-NEXT: ret
call void @llvm.masked.compressstore.v1f16(<1 x half> %v, ptr align 2 %base, <1 x i1> %mask)
ret void
@@ -33,48 +29,20 @@ declare void @llvm.masked.compressstore.v2f16(<2 x half>, ptr, <2 x i1>)
define void @compressstore_v2f16(ptr %base, <2 x half> %v, <2 x i1> %mask) {
; RV32-LABEL: compressstore_v2f16:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB1_3
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a1, a1, 2
-; RV32-NEXT: bnez a1, .LBB1_4
-; RV32-NEXT: .LBB1_2: # %else2
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB1_3: # %cond.store
-; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT: vse16.v v8, (a0)
-; RV32-NEXT: addi a0, a0, 2
-; RV32-NEXT: andi a1, a1, 2
-; RV32-NEXT: beqz a1, .LBB1_2
-; RV32-NEXT: .LBB1_4: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 1
-; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT: vse16.v v9, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v2f16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB1_3
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a1, a1, 2
-; RV64-NEXT: bnez a1, .LBB1_4
-; RV64-NEXT: .LBB1_2: # %else2
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB1_3: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT: vse16.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: andi a1, a1, 2
-; RV64-NEXT: beqz a1, .LBB1_2
-; RV64-NEXT: .LBB1_4: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 1
-; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT: vse16.v v9, (a0)
; RV64-NEXT: ret
call void @llvm.masked.compressstore.v2f16(<2 x half> %v, ptr align 2 %base, <2 x i1> %mask)
ret void
@@ -84,88 +52,20 @@ declare void @llvm.masked.compressstore.v4f16(<4 x half>, ptr, <4 x i1>)
define void @compressstore_v4f16(ptr %base, <4 x half> %v, <4 x i1> %mask) {
; RV32-LABEL: compressstore_v4f16:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB2_5
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: bnez a2, .LBB2_6
-; RV32-NEXT: .LBB2_2: # %else2
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: bnez a2, .LBB2_7
-; RV32-NEXT: .LBB2_3: # %else5
-; RV32-NEXT: andi a1, a1, 8
-; RV32-NEXT: bnez a1, .LBB2_8
-; RV32-NEXT: .LBB2_4: # %else8
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB2_5: # %cond.store
-; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT: vse16.v v8, (a0)
-; RV32-NEXT: addi a0, a0, 2
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: beqz a2, .LBB2_2
-; RV32-NEXT: .LBB2_6: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 1
-; RV32-NEXT: vse16.v v9, (a0)
-; RV32-NEXT: addi a0, a0, 2
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: beqz a2, .LBB2_3
-; RV32-NEXT: .LBB2_7: # %cond.store4
-; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; RV32-NEXT: vse16.v v9, (a0)
-; RV32-NEXT: addi a0, a0, 2
-; RV32-NEXT: andi a1, a1, 8
-; RV32-NEXT: beqz a1, .LBB2_4
-; RV32-NEXT: .LBB2_8: # %cond.store7
-; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 3
-; RV32-NEXT: vse16.v v8, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v4f16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB2_5
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: bnez a2, .LBB2_6
-; RV64-NEXT: .LBB2_2: # %else2
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: bnez a2, .LBB2_7
-; RV64-NEXT: .LBB2_3: # %else5
-; RV64-NEXT: andi a1, a1, 8
-; RV64-NEXT: bnez a1, .LBB2_8
-; RV64-NEXT: .LBB2_4: # %else8
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB2_5: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-NEXT: vse16.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: beqz a2, .LBB2_2
-; RV64-NEXT: .LBB2_6: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 1
-; RV64-NEXT: vse16.v v9, (a0)
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: beqz a2, .LBB2_3
-; RV64-NEXT: .LBB2_7: # %cond.store4
-; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; RV64-NEXT: vse16.v v9, (a0)
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: andi a1, a1, 8
-; RV64-NEXT: beqz a1, .LBB2_4
-; RV64-NEXT: .LBB2_8: # %cond.store7
-; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 3
-; RV64-NEXT: vse16.v v8, (a0)
; RV64-NEXT: ret
call void @llvm.masked.compressstore.v4f16(<4 x half> %v, ptr align 2 %base, <4 x i1> %mask)
ret void
@@ -175,168 +75,20 @@ declare void @llvm.masked.compressstore.v8f16(<8 x half>, ptr, <8 x i1>)
define void @compressstore_v8f16(ptr %base, <8 x half> %v, <8 x i1> %mask) {
; RV32-LABEL: compressstore_v8f16:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB3_9
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: bnez a2, .LBB3_10
-; RV32-NEXT: .LBB3_2: # %else2
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: bnez a2, .LBB3_11
-; RV32-NEXT: .LBB3_3: # %else5
-; RV32-NEXT: andi a2, a1, 8
-; RV32-NEXT: bnez a2, .LBB3_12
-; RV32-NEXT: .LBB3_4: # %else8
-; RV32-NEXT: andi a2, a1, 16
-; RV32-NEXT: bnez a2, .LBB3_13
-; RV32-NEXT: .LBB3_5: # %else11
-; RV32-NEXT: andi a2, a1, 32
-; RV32-NEXT: bnez a2, .LBB3_14
-; RV32-NEXT: .LBB3_6: # %else14
-; RV32-NEXT: andi a2, a1, 64
-; RV32-NEXT: bnez a2, .LBB3_15
-; RV32-NEXT: .LBB3_7: # %else17
-; RV32-NEXT: andi a1, a1, -128
-; RV32-NEXT: bnez a1, .LBB3_16
-; RV32-NEXT: .LBB3_8: # %else20
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB3_9: # %cond.store
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vse16.v v8, (a0)
-; RV32-NEXT: addi a0, a0, 2
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: beqz a2, .LBB3_2
-; RV32-NEXT: .LBB3_10: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 1
-; RV32-NEXT: vse16.v v9, (a0)
-; RV32-NEXT: addi a0, a0, 2
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: beqz a2, .LBB3_3
-; RV32-NEXT: .LBB3_11: # %cond.store4
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 2
-; RV32-NEXT: vse16.v v9, (a0)
-; RV32-NEXT: addi a0, a0, 2
-; RV32-NEXT: andi a2, a1, 8
-; RV32-NEXT: beqz a2, .LBB3_4
-; RV32-NEXT: .LBB3_12: # %cond.store7
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 3
-; RV32-NEXT: vse16.v v9, (a0)
-; RV32-NEXT: addi a0, a0, 2
-; RV32-NEXT: andi a2, a1, 16
-; RV32-NEXT: beqz a2, .LBB3_5
-; RV32-NEXT: .LBB3_13: # %cond.store10
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; RV32-NEXT: vse16.v v9, (a0)
-; RV32-NEXT: addi a0, a0, 2
-; RV32-NEXT: andi a2, a1, 32
-; RV32-NEXT: beqz a2, .LBB3_6
-; RV32-NEXT: .LBB3_14: # %cond.store13
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 5
-; RV32-NEXT: vse16.v v9, (a0)
-; RV32-NEXT: addi a0, a0, 2
-; RV32-NEXT: andi a2, a1, 64
-; RV32-NEXT: beqz a2, .LBB3_7
-; RV32-NEXT: .LBB3_15: # %cond.store16
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 6
-; RV32-NEXT: vse16.v v9, (a0)
-; RV32-NEXT: addi a0, a0, 2
-; RV32-NEXT: andi a1, a1, -128
-; RV32-NEXT: beqz a1, .LBB3_8
-; RV32-NEXT: .LBB3_16: # %cond.store19
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 7
-; RV32-NEXT: vse16.v v8, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v8f16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB3_9
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: bnez a2, .LBB3_10
-; RV64-NEXT: .LBB3_2: # %else2
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: bnez a2, .LBB3_11
-; RV64-NEXT: .LBB3_3: # %else5
-; RV64-NEXT: andi a2, a1, 8
-; RV64-NEXT: bnez a2, .LBB3_12
-; RV64-NEXT: .LBB3_4: # %else8
-; RV64-NEXT: andi a2, a1, 16
-; RV64-NEXT: bnez a2, .LBB3_13
-; RV64-NEXT: .LBB3_5: # %else11
-; RV64-NEXT: andi a2, a1, 32
-; RV64-NEXT: bnez a2, .LBB3_14
-; RV64-NEXT: .LBB3_6: # %else14
-; RV64-NEXT: andi a2, a1, 64
-; RV64-NEXT: bnez a2, .LBB3_15
-; RV64-NEXT: .LBB3_7: # %else17
-; RV64-NEXT: andi a1, a1, -128
-; RV64-NEXT: bnez a1, .LBB3_16
-; RV64-NEXT: .LBB3_8: # %else20
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB3_9: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vse16.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: beqz a2, .LBB3_2
-; RV64-NEXT: .LBB3_10: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 1
-; RV64-NEXT: vse16.v v9, (a0)
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: beqz a2, .LBB3_3
-; RV64-NEXT: .LBB3_11: # %cond.store4
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 2
-; RV64-NEXT: vse16.v v9, (a0)
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: andi a2, a1, 8
-; RV64-NEXT: beqz a2, .LBB3_4
-; RV64-NEXT: .LBB3_12: # %cond.store7
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 3
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; RV64-NEXT: vse16.v v9, (a0)
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: andi a2, a1, 16
-; RV64-NEXT: beqz a2, .LBB3_5
-; RV64-NEXT: .LBB3_13: # %cond.store10
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 4
-; RV64-NEXT: vse16.v v9, (a0)
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: andi a2, a1, 32
-; RV64-NEXT: beqz a2, .LBB3_6
-; RV64-NEXT: .LBB3_14: # %cond.store13
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 5
-; RV64-NEXT: vse16.v v9, (a0)
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: andi a2, a1, 64
-; RV64-NEXT: beqz a2, .LBB3_7
-; RV64-NEXT: .LBB3_15: # %cond.store16
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 6
-; RV64-NEXT: vse16.v v9, (a0)
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: andi a1, a1, -128
-; RV64-NEXT: beqz a1, .LBB3_8
-; RV64-NEXT: .LBB3_16: # %cond.store19
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 7
-; RV64-NEXT: vse16.v v8, (a0)
; RV64-NEXT: ret
call void @llvm.masked.compressstore.v8f16(<8 x half> %v, ptr align 2 %base, <8 x i1> %mask)
ret void
@@ -346,24 +98,20 @@ declare void @llvm.masked.compressstore.v1f32(<1 x float>, ptr, <1 x i1>)
define void @compressstore_v1f32(ptr %base, <1 x float> %v, <1 x i1> %mask) {
; RV32-LABEL: compressstore_v1f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT: vfirst.m a1, v0
-; RV32-NEXT: bnez a1, .LBB4_2
-; RV32-NEXT: # %bb.1: # %cond.store
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT: vse32.v v8, (a0)
-; RV32-NEXT: .LBB4_2: # %else
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vse32.v v9, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v1f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT: vfirst.m a1, v0
-; RV64-NEXT: bnez a1, .LBB4_2
-; RV64-NEXT: # %bb.1: # %cond.store
; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-NEXT: vse32.v v8, (a0)
-; RV64-NEXT: .LBB4_2: # %else
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vse32.v v9, (a0)
; RV64-NEXT: ret
call void @llvm.masked.compressstore.v1f32(<1 x float> %v, ptr align 4 %base, <1 x i1> %mask)
ret void
@@ -373,48 +121,20 @@ declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
define void @compressstore_v2f32(ptr %base, <2 x float> %v, <2 x i1> %mask) {
; RV32-LABEL: compressstore_v2f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB5_3
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a1, a1, 2
-; RV32-NEXT: bnez a1, .LBB5_4
-; RV32-NEXT: .LBB5_2: # %else2
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB5_3: # %cond.store
-; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT: vse32.v v8, (a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: andi a1, a1, 2
-; RV32-NEXT: beqz a1, .LBB5_2
-; RV32-NEXT: .LBB5_4: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 1
-; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vse32.v v9, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v2f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB5_3
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a1, a1, 2
-; RV64-NEXT: bnez a1, .LBB5_4
-; RV64-NEXT: .LBB5_2: # %else2
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB5_3: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-NEXT: vse32.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: andi a1, a1, 2
-; RV64-NEXT: beqz a1, .LBB5_2
-; RV64-NEXT: .LBB5_4: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 1
-; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vse32.v v9, (a0)
; RV64-NEXT: ret
call void @llvm.masked.compressstore.v2f32(<2 x float> %v, ptr align 4 %base, <2 x i1> %mask)
ret void
@@ -424,88 +144,20 @@ declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
define void @compressstore_v4f32(ptr %base, <4 x float> %v, <4 x i1> %mask) {
; RV32-LABEL: compressstore_v4f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB6_5
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: bnez a2, .LBB6_6
-; RV32-NEXT: .LBB6_2: # %else2
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: bnez a2, .LBB6_7
-; RV32-NEXT: .LBB6_3: # %else5
-; RV32-NEXT: andi a1, a1, 8
-; RV32-NEXT: bnez a1, .LBB6_8
-; RV32-NEXT: .LBB6_4: # %else8
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB6_5: # %cond.store
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vse32.v v8, (a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: beqz a2, .LBB6_2
-; RV32-NEXT: .LBB6_6: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 1
-; RV32-NEXT: vse32.v v9, (a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: beqz a2, .LBB6_3
-; RV32-NEXT: .LBB6_7: # %cond.store4
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; RV32-NEXT: vse32.v v9, (a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: andi a1, a1, 8
-; RV32-NEXT: beqz a1, .LBB6_4
-; RV32-NEXT: .LBB6_8: # %cond.store7
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 3
-; RV32-NEXT: vse32.v v8, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v4f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB6_5
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: bnez a2, .LBB6_6
-; RV64-NEXT: .LBB6_2: # %else2
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: bnez a2, .LBB6_7
-; RV64-NEXT: .LBB6_3: # %else5
-; RV64-NEXT: andi a1, a1, 8
-; RV64-NEXT: bnez a1, .LBB6_8
-; RV64-NEXT: .LBB6_4: # %else8
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB6_5: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vse32.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: beqz a2, .LBB6_2
-; RV64-NEXT: .LBB6_6: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 1
-; RV64-NEXT: vse32.v v9, (a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: beqz a2, .LBB6_3
-; RV64-NEXT: .LBB6_7: # %cond.store4
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; RV64-NEXT: vse32.v v9, (a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: andi a1, a1, 8
-; RV64-NEXT: beqz a1, .LBB6_4
-; RV64-NEXT: .LBB6_8: # %cond.store7
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 3
-; RV64-NEXT: vse32.v v8, (a0)
; RV64-NEXT: ret
call void @llvm.masked.compressstore.v4f32(<4 x float> %v, ptr align 4 %base, <4 x i1> %mask)
ret void
@@ -515,176 +167,20 @@ declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
define void @compressstore_v8f32(ptr %base, <8 x float> %v, <8 x i1> %mask) {
; RV32-LABEL: compressstore_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB7_9
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: bnez a2, .LBB7_10
-; RV32-NEXT: .LBB7_2: # %else2
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: bnez a2, .LBB7_11
-; RV32-NEXT: .LBB7_3: # %else5
-; RV32-NEXT: andi a2, a1, 8
-; RV32-NEXT: bnez a2, .LBB7_12
-; RV32-NEXT: .LBB7_4: # %else8
-; RV32-NEXT: andi a2, a1, 16
-; RV32-NEXT: bnez a2, .LBB7_13
-; RV32-NEXT: .LBB7_5: # %else11
-; RV32-NEXT: andi a2, a1, 32
-; RV32-NEXT: bnez a2, .LBB7_14
-; RV32-NEXT: .LBB7_6: # %else14
-; RV32-NEXT: andi a2, a1, 64
-; RV32-NEXT: bnez a2, .LBB7_15
-; RV32-NEXT: .LBB7_7: # %else17
-; RV32-NEXT: andi a1, a1, -128
-; RV32-NEXT: bnez a1, .LBB7_16
-; RV32-NEXT: .LBB7_8: # %else20
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB7_9: # %cond.store
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vse32.v v8, (a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: beqz a2, .LBB7_2
-; RV32-NEXT: .LBB7_10: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 1
-; RV32-NEXT: vse32.v v10, (a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: beqz a2, .LBB7_3
-; RV32-NEXT: .LBB7_11: # %cond.store4
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 2
-; RV32-NEXT: vse32.v v10, (a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: andi a2, a1, 8
-; RV32-NEXT: beqz a2, .LBB7_4
-; RV32-NEXT: .LBB7_12: # %cond.store7
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 3
-; RV32-NEXT: vse32.v v10, (a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: andi a2, a1, 16
-; RV32-NEXT: beqz a2, .LBB7_5
-; RV32-NEXT: .LBB7_13: # %cond.store10
-; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 4
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vse32.v v10, (a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: andi a2, a1, 32
-; RV32-NEXT: beqz a2, .LBB7_6
-; RV32-NEXT: .LBB7_14: # %cond.store13
-; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 5
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vse32.v v10, (a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: andi a2, a1, 64
-; RV32-NEXT: beqz a2, .LBB7_7
-; RV32-NEXT: .LBB7_15: # %cond.store16
-; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 6
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vcompress.vm v10, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; RV32-NEXT: vse32.v v10, (a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: andi a1, a1, -128
-; RV32-NEXT: beqz a1, .LBB7_8
-; RV32-NEXT: .LBB7_16: # %cond.store19
-; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 7
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vse32.v v8, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB7_9
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: bnez a2, .LBB7_10
-; RV64-NEXT: .LBB7_2: # %else2
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: bnez a2, .LBB7_11
-; RV64-NEXT: .LBB7_3: # %else5
-; RV64-NEXT: andi a2, a1, 8
-; RV64-NEXT: bnez a2, .LBB7_12
-; RV64-NEXT: .LBB7_4: # %else8
-; RV64-NEXT: andi a2, a1, 16
-; RV64-NEXT: bnez a2, .LBB7_13
-; RV64-NEXT: .LBB7_5: # %else11
-; RV64-NEXT: andi a2, a1, 32
-; RV64-NEXT: bnez a2, .LBB7_14
-; RV64-NEXT: .LBB7_6: # %else14
-; RV64-NEXT: andi a2, a1, 64
-; RV64-NEXT: bnez a2, .LBB7_15
-; RV64-NEXT: .LBB7_7: # %else17
-; RV64-NEXT: andi a1, a1, -128
-; RV64-NEXT: bnez a1, .LBB7_16
-; RV64-NEXT: .LBB7_8: # %else20
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB7_9: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vse32.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: beqz a2, .LBB7_2
-; RV64-NEXT: .LBB7_10: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 1
-; RV64-NEXT: vse32.v v10, (a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: beqz a2, .LBB7_3
-; RV64-NEXT: .LBB7_11: # %cond.store4
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 2
-; RV64-NEXT: vse32.v v10, (a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: andi a2, a1, 8
-; RV64-NEXT: beqz a2, .LBB7_4
-; RV64-NEXT: .LBB7_12: # %cond.store7
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 3
-; RV64-NEXT: vse32.v v10, (a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: andi a2, a1, 16
-; RV64-NEXT: beqz a2, .LBB7_5
-; RV64-NEXT: .LBB7_13: # %cond.store10
-; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 4
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vse32.v v10, (a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: andi a2, a1, 32
-; RV64-NEXT: beqz a2, .LBB7_6
-; RV64-NEXT: .LBB7_14: # %cond.store13
-; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 5
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vse32.v v10, (a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: andi a2, a1, 64
-; RV64-NEXT: beqz a2, .LBB7_7
-; RV64-NEXT: .LBB7_15: # %cond.store16
-; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 6
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vcompress.vm v10, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; RV64-NEXT: vse32.v v10, (a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: andi a1, a1, -128
-; RV64-NEXT: beqz a1, .LBB7_8
-; RV64-NEXT: .LBB7_16: # %cond.store19
-; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 7
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vse32.v v8, (a0)
; RV64-NEXT: ret
call void @llvm.masked.compressstore.v8f32(<8 x float> %v, ptr align 4 %base, <8 x i1> %mask)
ret void
@@ -694,24 +190,20 @@ declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>)
define void @compressstore_v1f64(ptr %base, <1 x double> %v, <1 x i1> %mask) {
; RV32-LABEL: compressstore_v1f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT: vfirst.m a1, v0
-; RV32-NEXT: bnez a1, .LBB8_2
-; RV32-NEXT: # %bb.1: # %cond.store
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vse64.v v8, (a0)
-; RV32-NEXT: .LBB8_2: # %else
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT: vse64.v v9, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v1f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT: vfirst.m a1, v0
-; RV64-NEXT: bnez a1, .LBB8_2
-; RV64-NEXT: # %bb.1: # %cond.store
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: .LBB8_2: # %else
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vse64.v v9, (a0)
; RV64-NEXT: ret
call void @llvm.masked.compressstore.v1f64(<1 x double> %v, ptr align 8 %base, <1 x i1> %mask)
ret void
@@ -721,48 +213,20 @@ declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
define void @compressstore_v2f64(ptr %base, <2 x double> %v, <2 x i1> %mask) {
; RV32-LABEL: compressstore_v2f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB9_3
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a1, a1, 2
-; RV32-NEXT: bnez a1, .LBB9_4
-; RV32-NEXT: .LBB9_2: # %else2
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB9_3: # %cond.store
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vse64.v v8, (a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a1, a1, 2
-; RV32-NEXT: beqz a1, .LBB9_2
-; RV32-NEXT: .LBB9_4: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 1
-; RV32-NEXT: vse64.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT: vse64.v v9, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v2f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB9_3
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a1, a1, 2
-; RV64-NEXT: bnez a1, .LBB9_4
-; RV64-NEXT: .LBB9_2: # %else2
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB9_3: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a1, a1, 2
-; RV64-NEXT: beqz a1, .LBB9_2
-; RV64-NEXT: .LBB9_4: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 1
-; RV64-NEXT: vse64.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vse64.v v9, (a0)
; RV64-NEXT: ret
call void @llvm.masked.compressstore.v2f64(<2 x double> %v, ptr align 8 %base, <2 x i1> %mask)
ret void
@@ -772,92 +236,20 @@ declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
define void @compressstore_v4f64(ptr %base, <4 x double> %v, <4 x i1> %mask) {
; RV32-LABEL: compressstore_v4f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB10_5
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: bnez a2, .LBB10_6
-; RV32-NEXT: .LBB10_2: # %else2
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: bnez a2, .LBB10_7
-; RV32-NEXT: .LBB10_3: # %else5
-; RV32-NEXT: andi a1, a1, 8
-; RV32-NEXT: bnez a1, .LBB10_8
-; RV32-NEXT: .LBB10_4: # %else8
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB10_5: # %cond.store
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vse64.v v8, (a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: beqz a2, .LBB10_2
-; RV32-NEXT: .LBB10_6: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 1
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vcompress.vm v10, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m2, ta, ma
; RV32-NEXT: vse64.v v10, (a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: beqz a2, .LBB10_3
-; RV32-NEXT: .LBB10_7: # %cond.store4
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 2
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vse64.v v10, (a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a1, a1, 8
-; RV32-NEXT: beqz a1, .LBB10_4
-; RV32-NEXT: .LBB10_8: # %cond.store7
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 3
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v4f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB10_5
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: bnez a2, .LBB10_6
-; RV64-NEXT: .LBB10_2: # %else2
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: bnez a2, .LBB10_7
-; RV64-NEXT: .LBB10_3: # %else5
-; RV64-NEXT: andi a1, a1, 8
-; RV64-NEXT: bnez a1, .LBB10_8
-; RV64-NEXT: .LBB10_4: # %else8
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB10_5: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: beqz a2, .LBB10_2
-; RV64-NEXT: .LBB10_6: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 1
-; RV64-NEXT: vse64.v v10, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: beqz a2, .LBB10_3
-; RV64-NEXT: .LBB10_7: # %cond.store4
-; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 2
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vcompress.vm v10, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
; RV64-NEXT: vse64.v v10, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a1, a1, 8
-; RV64-NEXT: beqz a1, .LBB10_4
-; RV64-NEXT: .LBB10_8: # %cond.store7
-; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 3
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: ret
call void @llvm.masked.compressstore.v4f64(<4 x double> %v, ptr align 8 %base, <4 x i1> %mask)
ret void
@@ -867,213 +259,21 @@ declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
define void @compressstore_v8f64(ptr %base, <8 x double> %v, <8 x i1> %mask) {
; RV32-LABEL: compressstore_v8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB11_11
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: bnez a2, .LBB11_12
-; RV32-NEXT: .LBB11_2: # %else2
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: bnez a2, .LBB11_13
-; RV32-NEXT: .LBB11_3: # %else5
-; RV32-NEXT: andi a2, a1, 8
-; RV32-NEXT: beqz a2, .LBB11_5
-; RV32-NEXT: .LBB11_4: # %cond.store7
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 3
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vse64.v v12, (a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: .LBB11_5: # %else8
-; RV32-NEXT: addi sp, sp, -320
-; RV32-NEXT: .cfi_def_cfa_offset 320
-; RV32-NEXT: sw ra, 316(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 312(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 320
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: andi a2, a1, 16
-; RV32-NEXT: bnez a2, .LBB11_14
-; RV32-NEXT: # %bb.6: # %else11
-; RV32-NEXT: andi a2, a1, 32
-; RV32-NEXT: bnez a2, .LBB11_15
-; RV32-NEXT: .LBB11_7: # %else14
-; RV32-NEXT: andi a2, a1, 64
-; RV32-NEXT: bnez a2, .LBB11_16
-; RV32-NEXT: .LBB11_8: # %else17
-; RV32-NEXT: andi a1, a1, -128
-; RV32-NEXT: beqz a1, .LBB11_10
-; RV32-NEXT: .LBB11_9: # %cond.store19
-; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vse64.v v8, (a1)
-; RV32-NEXT: fld fa5, 56(sp)
-; RV32-NEXT: fsd fa5, 0(a0)
-; RV32-NEXT: .LBB11_10: # %else20
-; RV32-NEXT: addi sp, s0, -320
-; RV32-NEXT: lw ra, 316(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 312(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 320
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB11_11: # %cond.store
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vse64.v v8, (a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: beqz a2, .LBB11_2
-; RV32-NEXT: .LBB11_12: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 1
+; RV32-NEXT: vcompress.vm v12, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV32-NEXT: vse64.v v12, (a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: beqz a2, .LBB11_3
-; RV32-NEXT: .LBB11_13: # %cond.store4
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 2
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vse64.v v12, (a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 8
-; RV32-NEXT: bnez a2, .LBB11_4
-; RV32-NEXT: j .LBB11_5
-; RV32-NEXT: .LBB11_14: # %cond.store10
-; RV32-NEXT: addi a2, sp, 192
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vse64.v v8, (a2)
-; RV32-NEXT: fld fa5, 224(sp)
-; RV32-NEXT: fsd fa5, 0(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 32
-; RV32-NEXT: beqz a2, .LBB11_7
-; RV32-NEXT: .LBB11_15: # %cond.store13
-; RV32-NEXT: addi a2, sp, 128
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vse64.v v8, (a2)
-; RV32-NEXT: fld fa5, 168(sp)
-; RV32-NEXT: fsd fa5, 0(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 64
-; RV32-NEXT: beqz a2, .LBB11_8
-; RV32-NEXT: .LBB11_16: # %cond.store16
-; RV32-NEXT: addi a2, sp, 64
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vse64.v v8, (a2)
-; RV32-NEXT: fld fa5, 112(sp)
-; RV32-NEXT: fsd fa5, 0(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a1, a1, -128
-; RV32-NEXT: bnez a1, .LBB11_9
-; RV32-NEXT: j .LBB11_10
+; RV32-NEXT: ret
;
; RV64-LABEL: compressstore_v8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB11_11
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: bnez a2, .LBB11_12
-; RV64-NEXT: .LBB11_2: # %else2
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: bnez a2, .LBB11_13
-; RV64-NEXT: .LBB11_3: # %else5
-; RV64-NEXT: andi a2, a1, 8
-; RV64-NEXT: beqz a2, .LBB11_5
-; RV64-NEXT: .LBB11_4: # %cond.store7
-; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 3
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v12, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: .LBB11_5: # %else8
-; RV64-NEXT: addi sp, sp, -320
-; RV64-NEXT: .cfi_def_cfa_offset 320
-; RV64-NEXT: sd ra, 312(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 304(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 320
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: andi a2, a1, 16
-; RV64-NEXT: bnez a2, .LBB11_14
-; RV64-NEXT: # %bb.6: # %else11
-; RV64-NEXT: andi a2, a1, 32
-; RV64-NEXT: bnez a2, .LBB11_15
-; RV64-NEXT: .LBB11_7: # %else14
-; RV64-NEXT: andi a2, a1, 64
-; RV64-NEXT: bnez a2, .LBB11_16
-; RV64-NEXT: .LBB11_8: # %else17
-; RV64-NEXT: andi a1, a1, -128
-; RV64-NEXT: beqz a1, .LBB11_10
-; RV64-NEXT: .LBB11_9: # %cond.store19
-; RV64-NEXT: mv a1, sp
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v8, (a1)
-; RV64-NEXT: fld fa5, 56(sp)
-; RV64-NEXT: fsd fa5, 0(a0)
-; RV64-NEXT: .LBB11_10: # %else20
-; RV64-NEXT: addi sp, s0, -320
-; RV64-NEXT: ld ra, 312(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 304(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 320
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB11_11: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: beqz a2, .LBB11_2
-; RV64-NEXT: .LBB11_12: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 1
+; RV64-NEXT: vcompress.vm v12, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV64-NEXT: vse64.v v12, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: beqz a2, .LBB11_3
-; RV64-NEXT: .LBB11_13: # %cond.store4
-; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 2
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v12, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 8
-; RV64-NEXT: bnez a2, .LBB11_4
-; RV64-NEXT: j .LBB11_5
-; RV64-NEXT: .LBB11_14: # %cond.store10
-; RV64-NEXT: addi a2, sp, 192
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v8, (a2)
-; RV64-NEXT: fld fa5, 224(sp)
-; RV64-NEXT: fsd fa5, 0(a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 32
-; RV64-NEXT: beqz a2, .LBB11_7
-; RV64-NEXT: .LBB11_15: # %cond.store13
-; RV64-NEXT: addi a2, sp, 128
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v8, (a2)
-; RV64-NEXT: fld fa5, 168(sp)
-; RV64-NEXT: fsd fa5, 0(a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 64
-; RV64-NEXT: beqz a2, .LBB11_8
-; RV64-NEXT: .LBB11_16: # %cond.store16
-; RV64-NEXT: addi a2, sp, 64
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v8, (a2)
-; RV64-NEXT: fld fa5, 112(sp)
-; RV64-NEXT: fsd fa5, 0(a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a1, a1, -128
-; RV64-NEXT: bnez a1, .LBB11_9
-; RV64-NEXT: j .LBB11_10
+; RV64-NEXT: ret
call void @llvm.masked.compressstore.v8f64(<8 x double> %v, ptr align 8 %base, <8 x i1> %mask)
ret void
}
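One case in this patch does not reduce to a single compress-and-store: test_compresstore_v32i64 earlier above, where <32 x i64> exceeds the largest (m8) register group, so each 16-element half is compressed and stored separately. Between the two stores the destination pointer must be advanced past whatever the first half wrote, which the generated code computes as 8 bytes per set bit in the low 16 mask bits. The relevant RV64 fragment, reproduced with hand-written comments added (RV32 uses cpop instead of cpopw; both are Zbb instructions, which the tests' RUN lines, not shown here, are presumed to enable):

    vmv.x.s a1, v0       # copy the mask register's low bits into a scalar
    zext.h  a1, a1       # keep only the 16 bits covering the first half
    cpopw   a1, a1       # popcount = number of elements already stored
    slli    a1, a1, 3    # scale by 8 bytes per i64 element
    add     a0, a0, a1   # bump the destination pointer for the second half's store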
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll
index eb0096d..a388ba9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll
@@ -6,13 +6,11 @@ declare void @llvm.masked.compressstore.v1i8(<1 x i8>, ptr, <1 x i1>)
define void @compressstore_v1i8(ptr %base, <1 x i8> %v, <1 x i1> %mask) {
; CHECK-LABEL: compressstore_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vfirst.m a1, v0
-; CHECK-NEXT: bnez a1, .LBB0_2
-; CHECK-NEXT: # %bb.1: # %cond.store
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vse8.v v8, (a0)
-; CHECK-NEXT: .LBB0_2: # %else
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vse8.v v9, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v1i8(<1 x i8> %v, ptr %base, <1 x i1> %mask)
ret void
@@ -22,25 +20,11 @@ declare void @llvm.masked.compressstore.v2i8(<2 x i8>, ptr, <2 x i1>)
define void @compressstore_v2i8(ptr %base, <2 x i8> %v, <2 x i1> %mask) {
; CHECK-LABEL: compressstore_v2i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a1, v0
-; CHECK-NEXT: andi a2, a1, 1
-; CHECK-NEXT: bnez a2, .LBB1_3
-; CHECK-NEXT: # %bb.1: # %else
-; CHECK-NEXT: andi a1, a1, 2
-; CHECK-NEXT: bnez a1, .LBB1_4
-; CHECK-NEXT: .LBB1_2: # %else2
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB1_3: # %cond.store
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vse8.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: andi a1, a1, 2
-; CHECK-NEXT: beqz a1, .LBB1_2
-; CHECK-NEXT: .LBB1_4: # %cond.store1
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 1
-; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vse8.v v9, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v2i8(<2 x i8> %v, ptr %base, <2 x i1> %mask)
ret void
@@ -50,45 +34,11 @@ declare void @llvm.masked.compressstore.v4i8(<4 x i8>, ptr, <4 x i1>)
define void @compressstore_v4i8(ptr %base, <4 x i8> %v, <4 x i1> %mask) {
; CHECK-LABEL: compressstore_v4i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a1, v0
-; CHECK-NEXT: andi a2, a1, 1
-; CHECK-NEXT: bnez a2, .LBB2_5
-; CHECK-NEXT: # %bb.1: # %else
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: bnez a2, .LBB2_6
-; CHECK-NEXT: .LBB2_2: # %else2
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: bnez a2, .LBB2_7
-; CHECK-NEXT: .LBB2_3: # %else5
-; CHECK-NEXT: andi a1, a1, 8
-; CHECK-NEXT: bnez a1, .LBB2_8
-; CHECK-NEXT: .LBB2_4: # %else8
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB2_5: # %cond.store
-; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT: vse8.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: beqz a2, .LBB2_2
-; CHECK-NEXT: .LBB2_6: # %cond.store1
-; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 1
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
; CHECK-NEXT: vse8.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: beqz a2, .LBB2_3
-; CHECK-NEXT: .LBB2_7: # %cond.store4
-; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vse8.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: andi a1, a1, 8
-; CHECK-NEXT: beqz a1, .LBB2_4
-; CHECK-NEXT: .LBB2_8: # %cond.store7
-; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 3
-; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v4i8(<4 x i8> %v, ptr %base, <4 x i1> %mask)
ret void
@@ -98,85 +48,11 @@ declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
define void @compressstore_v8i8(ptr %base, <8 x i8> %v, <8 x i1> %mask) {
; CHECK-LABEL: compressstore_v8i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a1, v0
-; CHECK-NEXT: andi a2, a1, 1
-; CHECK-NEXT: bnez a2, .LBB3_9
-; CHECK-NEXT: # %bb.1: # %else
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: bnez a2, .LBB3_10
-; CHECK-NEXT: .LBB3_2: # %else2
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: bnez a2, .LBB3_11
-; CHECK-NEXT: .LBB3_3: # %else5
-; CHECK-NEXT: andi a2, a1, 8
-; CHECK-NEXT: bnez a2, .LBB3_12
-; CHECK-NEXT: .LBB3_4: # %else8
-; CHECK-NEXT: andi a2, a1, 16
-; CHECK-NEXT: bnez a2, .LBB3_13
-; CHECK-NEXT: .LBB3_5: # %else11
-; CHECK-NEXT: andi a2, a1, 32
-; CHECK-NEXT: bnez a2, .LBB3_14
-; CHECK-NEXT: .LBB3_6: # %else14
-; CHECK-NEXT: andi a2, a1, 64
-; CHECK-NEXT: bnez a2, .LBB3_15
-; CHECK-NEXT: .LBB3_7: # %else17
-; CHECK-NEXT: andi a1, a1, -128
-; CHECK-NEXT: bnez a1, .LBB3_16
-; CHECK-NEXT: .LBB3_8: # %else20
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB3_9: # %cond.store
-; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT: vse8.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: beqz a2, .LBB3_2
-; CHECK-NEXT: .LBB3_10: # %cond.store1
-; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-NEXT: vse8.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: beqz a2, .LBB3_3
-; CHECK-NEXT: .LBB3_11: # %cond.store4
-; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vse8.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: andi a2, a1, 8
-; CHECK-NEXT: beqz a2, .LBB3_4
-; CHECK-NEXT: .LBB3_12: # %cond.store7
-; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 3
-; CHECK-NEXT: vse8.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: andi a2, a1, 16
-; CHECK-NEXT: beqz a2, .LBB3_5
-; CHECK-NEXT: .LBB3_13: # %cond.store10
-; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 4
-; CHECK-NEXT: vse8.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: andi a2, a1, 32
-; CHECK-NEXT: beqz a2, .LBB3_6
-; CHECK-NEXT: .LBB3_14: # %cond.store13
-; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 5
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; CHECK-NEXT: vse8.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: andi a2, a1, 64
-; CHECK-NEXT: beqz a2, .LBB3_7
-; CHECK-NEXT: .LBB3_15: # %cond.store16
-; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 6
-; CHECK-NEXT: vse8.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: andi a1, a1, -128
-; CHECK-NEXT: beqz a1, .LBB3_8
-; CHECK-NEXT: .LBB3_16: # %cond.store19
-; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v8i8(<8 x i8> %v, ptr %base, <8 x i1> %mask)
ret void
@@ -186,13 +62,11 @@ declare void @llvm.masked.compressstore.v1i16(<1 x i16>, ptr, <1 x i1>)
define void @compressstore_v1i16(ptr %base, <1 x i16> %v, <1 x i1> %mask) {
; CHECK-LABEL: compressstore_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vfirst.m a1, v0
-; CHECK-NEXT: bnez a1, .LBB4_2
-; CHECK-NEXT: # %bb.1: # %cond.store
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vse16.v v8, (a0)
-; CHECK-NEXT: .LBB4_2: # %else
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vse16.v v9, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v1i16(<1 x i16> %v, ptr align 2 %base, <1 x i1> %mask)
ret void
@@ -202,25 +76,11 @@ declare void @llvm.masked.compressstore.v2i16(<2 x i16>, ptr, <2 x i1>)
define void @compressstore_v2i16(ptr %base, <2 x i16> %v, <2 x i1> %mask) {
; CHECK-LABEL: compressstore_v2i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a1, v0
-; CHECK-NEXT: andi a2, a1, 1
-; CHECK-NEXT: bnez a2, .LBB5_3
-; CHECK-NEXT: # %bb.1: # %else
-; CHECK-NEXT: andi a1, a1, 2
-; CHECK-NEXT: bnez a1, .LBB5_4
-; CHECK-NEXT: .LBB5_2: # %else2
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB5_3: # %cond.store
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vse16.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: andi a1, a1, 2
-; CHECK-NEXT: beqz a1, .LBB5_2
-; CHECK-NEXT: .LBB5_4: # %cond.store1
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 1
-; CHECK-NEXT: vse16.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vse16.v v9, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v2i16(<2 x i16> %v, ptr align 2 %base, <2 x i1> %mask)
ret void
@@ -230,45 +90,11 @@ declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
define void @compressstore_v4i16(ptr %base, <4 x i16> %v, <4 x i1> %mask) {
; CHECK-LABEL: compressstore_v4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a1, v0
-; CHECK-NEXT: andi a2, a1, 1
-; CHECK-NEXT: bnez a2, .LBB6_5
-; CHECK-NEXT: # %bb.1: # %else
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: bnez a2, .LBB6_6
-; CHECK-NEXT: .LBB6_2: # %else2
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: bnez a2, .LBB6_7
-; CHECK-NEXT: .LBB6_3: # %else5
-; CHECK-NEXT: andi a1, a1, 8
-; CHECK-NEXT: bnez a1, .LBB6_8
-; CHECK-NEXT: .LBB6_4: # %else8
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB6_5: # %cond.store
-; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT: vse16.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: beqz a2, .LBB6_2
-; CHECK-NEXT: .LBB6_6: # %cond.store1
-; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-NEXT: vse16.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: beqz a2, .LBB6_3
-; CHECK-NEXT: .LBB6_7: # %cond.store4
-; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; CHECK-NEXT: vse16.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: andi a1, a1, 8
-; CHECK-NEXT: beqz a1, .LBB6_4
-; CHECK-NEXT: .LBB6_8: # %cond.store7
-; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 3
-; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v4i16(<4 x i16> %v, ptr align 2 %base, <4 x i1> %mask)
ret void
@@ -278,85 +104,11 @@ declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
define void @compressstore_v8i16(ptr %base, <8 x i16> %v, <8 x i1> %mask) {
; CHECK-LABEL: compressstore_v8i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a1, v0
-; CHECK-NEXT: andi a2, a1, 1
-; CHECK-NEXT: bnez a2, .LBB7_9
-; CHECK-NEXT: # %bb.1: # %else
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: bnez a2, .LBB7_10
-; CHECK-NEXT: .LBB7_2: # %else2
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: bnez a2, .LBB7_11
-; CHECK-NEXT: .LBB7_3: # %else5
-; CHECK-NEXT: andi a2, a1, 8
-; CHECK-NEXT: bnez a2, .LBB7_12
-; CHECK-NEXT: .LBB7_4: # %else8
-; CHECK-NEXT: andi a2, a1, 16
-; CHECK-NEXT: bnez a2, .LBB7_13
-; CHECK-NEXT: .LBB7_5: # %else11
-; CHECK-NEXT: andi a2, a1, 32
-; CHECK-NEXT: bnez a2, .LBB7_14
-; CHECK-NEXT: .LBB7_6: # %else14
-; CHECK-NEXT: andi a2, a1, 64
-; CHECK-NEXT: bnez a2, .LBB7_15
-; CHECK-NEXT: .LBB7_7: # %else17
-; CHECK-NEXT: andi a1, a1, -128
-; CHECK-NEXT: bnez a1, .LBB7_16
-; CHECK-NEXT: .LBB7_8: # %else20
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB7_9: # %cond.store
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vse16.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: beqz a2, .LBB7_2
-; CHECK-NEXT: .LBB7_10: # %cond.store1
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-NEXT: vse16.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: beqz a2, .LBB7_3
-; CHECK-NEXT: .LBB7_11: # %cond.store4
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vse16.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: andi a2, a1, 8
-; CHECK-NEXT: beqz a2, .LBB7_4
-; CHECK-NEXT: .LBB7_12: # %cond.store7
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 3
-; CHECK-NEXT: vse16.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: andi a2, a1, 16
-; CHECK-NEXT: beqz a2, .LBB7_5
-; CHECK-NEXT: .LBB7_13: # %cond.store10
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 4
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vse16.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: andi a2, a1, 32
-; CHECK-NEXT: beqz a2, .LBB7_6
-; CHECK-NEXT: .LBB7_14: # %cond.store13
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 5
-; CHECK-NEXT: vse16.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: andi a2, a1, 64
-; CHECK-NEXT: beqz a2, .LBB7_7
-; CHECK-NEXT: .LBB7_15: # %cond.store16
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 6
-; CHECK-NEXT: vse16.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: andi a1, a1, -128
-; CHECK-NEXT: beqz a1, .LBB7_8
-; CHECK-NEXT: .LBB7_16: # %cond.store19
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v8i16(<8 x i16> %v, ptr align 2 %base, <8 x i1> %mask)
ret void
@@ -366,13 +118,11 @@ declare void @llvm.masked.compressstore.v1i32(<1 x i32>, ptr, <1 x i1>)
define void @compressstore_v1i32(ptr %base, <1 x i32> %v, <1 x i1> %mask) {
; CHECK-LABEL: compressstore_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vfirst.m a1, v0
-; CHECK-NEXT: bnez a1, .LBB8_2
-; CHECK-NEXT: # %bb.1: # %cond.store
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: .LBB8_2: # %else
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vse32.v v9, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v1i32(<1 x i32> %v, ptr align 4 %base, <1 x i1> %mask)
ret void
@@ -382,25 +132,11 @@ declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
define void @compressstore_v2i32(ptr %base, <2 x i32> %v, <2 x i1> %mask) {
; CHECK-LABEL: compressstore_v2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a1, v0
-; CHECK-NEXT: andi a2, a1, 1
-; CHECK-NEXT: bnez a2, .LBB9_3
-; CHECK-NEXT: # %bb.1: # %else
-; CHECK-NEXT: andi a1, a1, 2
-; CHECK-NEXT: bnez a1, .LBB9_4
-; CHECK-NEXT: .LBB9_2: # %else2
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB9_3: # %cond.store
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: andi a1, a1, 2
-; CHECK-NEXT: beqz a1, .LBB9_2
-; CHECK-NEXT: .LBB9_4: # %cond.store1
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 1
-; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vse32.v v9, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v2i32(<2 x i32> %v, ptr align 4 %base, <2 x i1> %mask)
ret void
@@ -410,45 +146,11 @@ declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
define void @compressstore_v4i32(ptr %base, <4 x i32> %v, <4 x i1> %mask) {
; CHECK-LABEL: compressstore_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a1, v0
-; CHECK-NEXT: andi a2, a1, 1
-; CHECK-NEXT: bnez a2, .LBB10_5
-; CHECK-NEXT: # %bb.1: # %else
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: bnez a2, .LBB10_6
-; CHECK-NEXT: .LBB10_2: # %else2
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: bnez a2, .LBB10_7
-; CHECK-NEXT: .LBB10_3: # %else5
-; CHECK-NEXT: andi a1, a1, 8
-; CHECK-NEXT: bnez a1, .LBB10_8
-; CHECK-NEXT: .LBB10_4: # %else8
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB10_5: # %cond.store
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: beqz a2, .LBB10_2
-; CHECK-NEXT: .LBB10_6: # %cond.store1
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-NEXT: vse32.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: beqz a2, .LBB10_3
-; CHECK-NEXT: .LBB10_7: # %cond.store4
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v9, (a0)
-; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: andi a1, a1, 8
-; CHECK-NEXT: beqz a1, .LBB10_4
-; CHECK-NEXT: .LBB10_8: # %cond.store7
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 3
-; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v4i32(<4 x i32> %v, ptr align 4 %base, <4 x i1> %mask)
ret void
@@ -458,89 +160,11 @@ declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
define void @compressstore_v8i32(ptr %base, <8 x i32> %v, <8 x i1> %mask) {
; CHECK-LABEL: compressstore_v8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a1, v0
-; CHECK-NEXT: andi a2, a1, 1
-; CHECK-NEXT: bnez a2, .LBB11_9
-; CHECK-NEXT: # %bb.1: # %else
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: bnez a2, .LBB11_10
-; CHECK-NEXT: .LBB11_2: # %else2
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: bnez a2, .LBB11_11
-; CHECK-NEXT: .LBB11_3: # %else5
-; CHECK-NEXT: andi a2, a1, 8
-; CHECK-NEXT: bnez a2, .LBB11_12
-; CHECK-NEXT: .LBB11_4: # %else8
-; CHECK-NEXT: andi a2, a1, 16
-; CHECK-NEXT: bnez a2, .LBB11_13
-; CHECK-NEXT: .LBB11_5: # %else11
-; CHECK-NEXT: andi a2, a1, 32
-; CHECK-NEXT: bnez a2, .LBB11_14
-; CHECK-NEXT: .LBB11_6: # %else14
-; CHECK-NEXT: andi a2, a1, 64
-; CHECK-NEXT: bnez a2, .LBB11_15
-; CHECK-NEXT: .LBB11_7: # %else17
-; CHECK-NEXT: andi a1, a1, -128
-; CHECK-NEXT: bnez a1, .LBB11_16
-; CHECK-NEXT: .LBB11_8: # %else20
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB11_9: # %cond.store
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: andi a2, a1, 2
-; CHECK-NEXT: beqz a2, .LBB11_2
-; CHECK-NEXT: .LBB11_10: # %cond.store1
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 1
-; CHECK-NEXT: vse32.v v10, (a0)
-; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: andi a2, a1, 4
-; CHECK-NEXT: beqz a2, .LBB11_3
-; CHECK-NEXT: .LBB11_11: # %cond.store4
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 2
-; CHECK-NEXT: vse32.v v10, (a0)
-; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: andi a2, a1, 8
-; CHECK-NEXT: beqz a2, .LBB11_4
-; CHECK-NEXT: .LBB11_12: # %cond.store7
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 3
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vcompress.vm v10, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; CHECK-NEXT: vse32.v v10, (a0)
-; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: andi a2, a1, 16
-; CHECK-NEXT: beqz a2, .LBB11_5
-; CHECK-NEXT: .LBB11_13: # %cond.store10
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 4
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v10, (a0)
-; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: andi a2, a1, 32
-; CHECK-NEXT: beqz a2, .LBB11_6
-; CHECK-NEXT: .LBB11_14: # %cond.store13
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 5
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v10, (a0)
-; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: andi a2, a1, 64
-; CHECK-NEXT: beqz a2, .LBB11_7
-; CHECK-NEXT: .LBB11_15: # %cond.store16
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 6
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v10, (a0)
-; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: andi a1, a1, -128
-; CHECK-NEXT: beqz a1, .LBB11_8
-; CHECK-NEXT: .LBB11_16: # %cond.store19
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v8i32(<8 x i32> %v, ptr align 4 %base, <8 x i1> %mask)
ret void
@@ -548,439 +172,59 @@ define void @compressstore_v8i32(ptr %base, <8 x i32> %v, <8 x i1> %mask) {
declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
define void @compressstore_v1i64(ptr %base, <1 x i64> %v, <1 x i1> %mask) {
-; RV32-LABEL: compressstore_v1i64:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT: vfirst.m a1, v0
-; RV32-NEXT: bnez a1, .LBB12_2
-; RV32-NEXT: # %bb.1: # %cond.store
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vx v9, v8, a1
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a2, v8
-; RV32-NEXT: sw a2, 0(a0)
-; RV32-NEXT: sw a1, 4(a0)
-; RV32-NEXT: .LBB12_2: # %else
-; RV32-NEXT: ret
-;
-; RV64-LABEL: compressstore_v1i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT: vfirst.m a1, v0
-; RV64-NEXT: bnez a1, .LBB12_2
-; RV64-NEXT: # %bb.1: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: .LBB12_2: # %else
-; RV64-NEXT: ret
+; CHECK-LABEL: compressstore_v1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vse64.v v9, (a0)
+; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v1i64(<1 x i64> %v, ptr align 8 %base, <1 x i1> %mask)
ret void
}
declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
define void @compressstore_v2i64(ptr %base, <2 x i64> %v, <2 x i1> %mask) {
-; RV32-LABEL: compressstore_v2i64:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB13_3
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a1, a1, 2
-; RV32-NEXT: bnez a1, .LBB13_4
-; RV32-NEXT: .LBB13_2: # %else2
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB13_3: # %cond.store
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vx v9, v8, a2
-; RV32-NEXT: vmv.x.s a2, v9
-; RV32-NEXT: vmv.x.s a3, v8
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a2, 4(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a1, a1, 2
-; RV32-NEXT: beqz a1, .LBB13_2
-; RV32-NEXT: .LBB13_4: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 1
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsrl.vx v9, v8, a1
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a2, v8
-; RV32-NEXT: sw a2, 0(a0)
-; RV32-NEXT: sw a1, 4(a0)
-; RV32-NEXT: ret
-;
-; RV64-LABEL: compressstore_v2i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB13_3
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a1, a1, 2
-; RV64-NEXT: bnez a1, .LBB13_4
-; RV64-NEXT: .LBB13_2: # %else2
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB13_3: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a1, a1, 2
-; RV64-NEXT: beqz a1, .LBB13_2
-; RV64-NEXT: .LBB13_4: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 1
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: ret
+; CHECK-LABEL: compressstore_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT: vcompress.vm v9, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vse64.v v9, (a0)
+; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v2i64(<2 x i64> %v, ptr align 8 %base, <2 x i1> %mask)
ret void
}
declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
define void @compressstore_v4i64(ptr %base, <4 x i64> %v, <4 x i1> %mask) {
-; RV32-LABEL: compressstore_v4i64:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB14_5
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: bnez a2, .LBB14_6
-; RV32-NEXT: .LBB14_2: # %else2
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: bnez a2, .LBB14_7
-; RV32-NEXT: .LBB14_3: # %else5
-; RV32-NEXT: andi a1, a1, 8
-; RV32-NEXT: bnez a1, .LBB14_8
-; RV32-NEXT: .LBB14_4: # %else8
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB14_5: # %cond.store
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vx v10, v8, a2
-; RV32-NEXT: vmv.x.s a2, v10
-; RV32-NEXT: vmv.x.s a3, v8
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a2, 4(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: beqz a2, .LBB14_2
-; RV32-NEXT: .LBB14_6: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 1
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsrl.vx v12, v10, a2
-; RV32-NEXT: vmv.x.s a2, v12
-; RV32-NEXT: vmv.x.s a3, v10
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a2, 4(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: beqz a2, .LBB14_3
-; RV32-NEXT: .LBB14_7: # %cond.store4
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 2
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsrl.vx v12, v10, a2
-; RV32-NEXT: vmv.x.s a2, v12
-; RV32-NEXT: vmv.x.s a3, v10
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a2, 4(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a1, a1, 8
-; RV32-NEXT: beqz a1, .LBB14_4
-; RV32-NEXT: .LBB14_8: # %cond.store7
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 3
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsrl.vx v10, v8, a1
-; RV32-NEXT: vmv.x.s a1, v10
-; RV32-NEXT: vmv.x.s a2, v8
-; RV32-NEXT: sw a2, 0(a0)
-; RV32-NEXT: sw a1, 4(a0)
-; RV32-NEXT: ret
-;
-; RV64-LABEL: compressstore_v4i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB14_5
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: bnez a2, .LBB14_6
-; RV64-NEXT: .LBB14_2: # %else2
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: bnez a2, .LBB14_7
-; RV64-NEXT: .LBB14_3: # %else5
-; RV64-NEXT: andi a1, a1, 8
-; RV64-NEXT: bnez a1, .LBB14_8
-; RV64-NEXT: .LBB14_4: # %else8
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB14_5: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: beqz a2, .LBB14_2
-; RV64-NEXT: .LBB14_6: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 1
-; RV64-NEXT: vse64.v v10, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: beqz a2, .LBB14_3
-; RV64-NEXT: .LBB14_7: # %cond.store4
-; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 2
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v10, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a1, a1, 8
-; RV64-NEXT: beqz a1, .LBB14_4
-; RV64-NEXT: .LBB14_8: # %cond.store7
-; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 3
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: ret
+; CHECK-LABEL: compressstore_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vcompress.vm v10, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vse64.v v10, (a0)
+; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v4i64(<4 x i64> %v, ptr align 8 %base, <4 x i1> %mask)
ret void
}
declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
define void @compressstore_v8i64(ptr %base, <8 x i64> %v, <8 x i1> %mask) {
-; RV32-LABEL: compressstore_v8i64:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a1, v0
-; RV32-NEXT: andi a2, a1, 1
-; RV32-NEXT: bnez a2, .LBB15_9
-; RV32-NEXT: # %bb.1: # %else
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: bnez a2, .LBB15_10
-; RV32-NEXT: .LBB15_2: # %else2
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: bnez a2, .LBB15_11
-; RV32-NEXT: .LBB15_3: # %else5
-; RV32-NEXT: andi a2, a1, 8
-; RV32-NEXT: bnez a2, .LBB15_12
-; RV32-NEXT: .LBB15_4: # %else8
-; RV32-NEXT: andi a2, a1, 16
-; RV32-NEXT: bnez a2, .LBB15_13
-; RV32-NEXT: .LBB15_5: # %else11
-; RV32-NEXT: andi a2, a1, 32
-; RV32-NEXT: bnez a2, .LBB15_14
-; RV32-NEXT: .LBB15_6: # %else14
-; RV32-NEXT: andi a2, a1, 64
-; RV32-NEXT: bnez a2, .LBB15_15
-; RV32-NEXT: .LBB15_7: # %else17
-; RV32-NEXT: andi a1, a1, -128
-; RV32-NEXT: bnez a1, .LBB15_16
-; RV32-NEXT: .LBB15_8: # %else20
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB15_9: # %cond.store
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vx v12, v8, a2
-; RV32-NEXT: vmv.x.s a2, v12
-; RV32-NEXT: vmv.x.s a3, v8
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a2, 4(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 2
-; RV32-NEXT: beqz a2, .LBB15_2
-; RV32-NEXT: .LBB15_10: # %cond.store1
-; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 1
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsrl.vx v16, v12, a2
-; RV32-NEXT: vmv.x.s a2, v16
-; RV32-NEXT: vmv.x.s a3, v12
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a2, 4(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 4
-; RV32-NEXT: beqz a2, .LBB15_3
-; RV32-NEXT: .LBB15_11: # %cond.store4
-; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 2
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsrl.vx v16, v12, a2
-; RV32-NEXT: vmv.x.s a2, v16
-; RV32-NEXT: vmv.x.s a3, v12
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a2, 4(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 8
-; RV32-NEXT: beqz a2, .LBB15_4
-; RV32-NEXT: .LBB15_12: # %cond.store7
-; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 3
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsrl.vx v16, v12, a2
-; RV32-NEXT: vmv.x.s a2, v16
-; RV32-NEXT: vmv.x.s a3, v12
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a2, 4(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 16
-; RV32-NEXT: beqz a2, .LBB15_5
-; RV32-NEXT: .LBB15_13: # %cond.store10
-; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 4
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsrl.vx v16, v12, a2
-; RV32-NEXT: vmv.x.s a2, v16
-; RV32-NEXT: vmv.x.s a3, v12
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a2, 4(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 32
-; RV32-NEXT: beqz a2, .LBB15_6
-; RV32-NEXT: .LBB15_14: # %cond.store13
-; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 5
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsrl.vx v16, v12, a2
-; RV32-NEXT: vmv.x.s a2, v16
-; RV32-NEXT: vmv.x.s a3, v12
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a2, 4(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a2, a1, 64
-; RV32-NEXT: beqz a2, .LBB15_7
-; RV32-NEXT: .LBB15_15: # %cond.store16
-; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 6
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsrl.vx v16, v12, a2
-; RV32-NEXT: vmv.x.s a2, v16
-; RV32-NEXT: vmv.x.s a3, v12
-; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: sw a2, 4(a0)
-; RV32-NEXT: addi a0, a0, 8
-; RV32-NEXT: andi a1, a1, -128
-; RV32-NEXT: beqz a1, .LBB15_8
-; RV32-NEXT: .LBB15_16: # %cond.store19
-; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 7
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsrl.vx v12, v8, a1
-; RV32-NEXT: vmv.x.s a1, v12
-; RV32-NEXT: vmv.x.s a2, v8
-; RV32-NEXT: sw a2, 0(a0)
-; RV32-NEXT: sw a1, 4(a0)
-; RV32-NEXT: ret
-;
-; RV64-LABEL: compressstore_v8i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a1, v0
-; RV64-NEXT: andi a2, a1, 1
-; RV64-NEXT: bnez a2, .LBB15_11
-; RV64-NEXT: # %bb.1: # %else
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: bnez a2, .LBB15_12
-; RV64-NEXT: .LBB15_2: # %else2
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: bnez a2, .LBB15_13
-; RV64-NEXT: .LBB15_3: # %else5
-; RV64-NEXT: andi a2, a1, 8
-; RV64-NEXT: beqz a2, .LBB15_5
-; RV64-NEXT: .LBB15_4: # %cond.store7
-; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 3
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v12, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: .LBB15_5: # %else8
-; RV64-NEXT: addi sp, sp, -320
-; RV64-NEXT: .cfi_def_cfa_offset 320
-; RV64-NEXT: sd ra, 312(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 304(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 320
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: andi a2, a1, 16
-; RV64-NEXT: bnez a2, .LBB15_14
-; RV64-NEXT: # %bb.6: # %else11
-; RV64-NEXT: andi a2, a1, 32
-; RV64-NEXT: bnez a2, .LBB15_15
-; RV64-NEXT: .LBB15_7: # %else14
-; RV64-NEXT: andi a2, a1, 64
-; RV64-NEXT: bnez a2, .LBB15_16
-; RV64-NEXT: .LBB15_8: # %else17
-; RV64-NEXT: andi a1, a1, -128
-; RV64-NEXT: beqz a1, .LBB15_10
-; RV64-NEXT: .LBB15_9: # %cond.store19
-; RV64-NEXT: mv a1, sp
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v8, (a1)
-; RV64-NEXT: ld a1, 56(sp)
-; RV64-NEXT: sd a1, 0(a0)
-; RV64-NEXT: .LBB15_10: # %else20
-; RV64-NEXT: addi sp, s0, -320
-; RV64-NEXT: ld ra, 312(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 304(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 320
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB15_11: # %cond.store
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 2
-; RV64-NEXT: beqz a2, .LBB15_2
-; RV64-NEXT: .LBB15_12: # %cond.store1
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 1
-; RV64-NEXT: vse64.v v12, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 4
-; RV64-NEXT: beqz a2, .LBB15_3
-; RV64-NEXT: .LBB15_13: # %cond.store4
-; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 2
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v12, (a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 8
-; RV64-NEXT: bnez a2, .LBB15_4
-; RV64-NEXT: j .LBB15_5
-; RV64-NEXT: .LBB15_14: # %cond.store10
-; RV64-NEXT: addi a2, sp, 192
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v8, (a2)
-; RV64-NEXT: ld a2, 224(sp)
-; RV64-NEXT: sd a2, 0(a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 32
-; RV64-NEXT: beqz a2, .LBB15_7
-; RV64-NEXT: .LBB15_15: # %cond.store13
-; RV64-NEXT: addi a2, sp, 128
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v8, (a2)
-; RV64-NEXT: ld a2, 168(sp)
-; RV64-NEXT: sd a2, 0(a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a2, a1, 64
-; RV64-NEXT: beqz a2, .LBB15_8
-; RV64-NEXT: .LBB15_16: # %cond.store16
-; RV64-NEXT: addi a2, sp, 64
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v8, (a2)
-; RV64-NEXT: ld a2, 112(sp)
-; RV64-NEXT: sd a2, 0(a0)
-; RV64-NEXT: addi a0, a0, 8
-; RV64-NEXT: andi a1, a1, -128
-; RV64-NEXT: bnez a1, .LBB15_9
-; RV64-NEXT: j .LBB15_10
+; CHECK-LABEL: compressstore_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT: vcompress.vm v12, v8, v0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vse64.v v12, (a0)
+; CHECK-NEXT: ret
call void @llvm.masked.compressstore.v8i64(<8 x i64> %v, ptr align 8 %base, <8 x i1> %mask)
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
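;; NOTE (editor's sketch, restating the diff above): the rewrite is mechanical.
;; For every masked compress-store in this file, e.g.
;;   call void @llvm.masked.compressstore.v4i32(<4 x i32> %v, ptr align 4 %base, <4 x i1> %mask)
;; the branchy per-element expansion is replaced by one branchless RVV sequence:
;;   vsetivli zero, 4, e32, m1, ta, ma   # VL = 4 elements, SEW = 32
;;   vcompress.vm v9, v8, v0             # pack active elements to the front of v9
;;   vcpop.m a1, v0                      # a1 = popcount(mask)
;;   vsetvli zero, a1, e32, m1, ta, ma   # shrink VL to the active-element count
;;   vse32.v v9, (a0)                    # single contiguous unit-stride store
;; The register group (m1/m2/m4) scales with total vector size, matching the
;; v8i32 (m2) and v8i64 (m4) cases above; RV32 and RV64 now share one CHECK
;; prefix because the scalar i64 element splitting is gone.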
diff --git a/llvm/test/CodeGen/SPIRV/ComparePointers.ll b/llvm/test/CodeGen/SPIRV/ComparePointers.ll
index fd2084d..9be0594 100644
--- a/llvm/test/CodeGen/SPIRV/ComparePointers.ll
+++ b/llvm/test/CodeGen/SPIRV/ComparePointers.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --mattr=+spirv1.3 %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; kernel void test(int global *in, int global *in2) {
;; if (!in)
diff --git a/llvm/test/CodeGen/SPIRV/capability-kernel.ll b/llvm/test/CodeGen/SPIRV/capability-kernel.ll
index 03ea58c..fea1951 100644
--- a/llvm/test/CodeGen/SPIRV/capability-kernel.ll
+++ b/llvm/test/CodeGen/SPIRV/capability-kernel.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-DAG: OpCapability Addresses
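;; NOTE (editor's sketch): the RUN lines added throughout this diff use lit's
;; conditional substitution, e.g.
;;   ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; When the `spirv-tools` lit feature is available, the %{ ... %} region expands
;; to the enclosed pipeline and the emitted binary is checked with spirv-val;
;; otherwise the region expands to nothing and the test passes without
;; validation. Lines left as "; TODO:" keep the command visible but disabled
;; for tests whose output does not yet validate.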
diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll
index 062863a..7e9c621 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK: %[[#INT8:]] = OpTypeInt 8 0
; CHECK: %[[#PTR1:]] = OpTypePointer CrossWorkgroup %[[#INT8]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll
index aaf97f8..fc999ba 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK: %[[#FLOAT32:]] = OpTypeFloat 32
; CHECK: %[[#PTR:]] = OpTypePointer CrossWorkgroup %[[#FLOAT32]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll
index 6d12023..a3a730a 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-DAG: %[[#INT:]] = OpTypeInt 32 0
; CHECK-DAG: %[[#PTR1:]] = OpTypePointer Function %[[#INT]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll
index 9e136ce..b74a344 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-DAG: %[[#IMAGE:]] = OpTypeImage %2 2D 0 0 0 0 Unknown ReadOnly
diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll
index 1fcc6d9..b8f205a 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-DAG: %[[#FLOAT32:]] = OpTypeFloat 32
; CHECK-DAG: %[[#PTR1:]] = OpTypePointer Function %[[#FLOAT32]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll
index 1b4e7a3..1667abc 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK: %[[#INT8:]] = OpTypeInt 8 0
; CHECK: %[[#PTR1:]] = OpTypePointer CrossWorkgroup %[[#INT8]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll b/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll
index 00b03c0..3a0d65e 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; TODO: OpFunctionParameter should be a pointer of struct base type.
; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
index 86f5f5b..d426fc4 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
@@ -1,5 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
-; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK: %[[TyInt8:.*]] = OpTypeInt 8 0
; CHECK: %[[TyInt8Ptr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt8]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll b/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll
index 52180d5..23c3faa 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-DAG: %[[#INT:]] = OpTypeInt 32
; CHECK-DAG: %[[#GLOBAL_PTR_INT:]] = OpTypePointer CrossWorkgroup %[[#INT]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll b/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll
index 473c2a8..83234e3 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-DAG: %[[#float:]] = OpTypeFloat 32
; CHECK-DAG: %[[#pointer:]] = OpTypePointer CrossWorkgroup %[[#float]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll
new file mode 100644
index 0000000..76769ab
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll
@@ -0,0 +1,28 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "known_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo"
+; CHECK-SPIRV-DAG: OpName %[[ArgToDeduce:.*]] "unknown_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Bar:.*]] "bar"
+; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[Void:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[LongPtr:.*]] = OpTypePointer CrossWorkgroup %[[Long]]
+; CHECK-SPIRV-DAG: %[[Fun:.*]] = OpTypeFunction %[[Void]] %[[LongPtr]]
+; CHECK-SPIRV: %[[Bar]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[ArgToDeduce]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: OpFunctionCall %[[Void]] %[[Foo]] %[[ArgToDeduce]]
+; CHECK-SPIRV: %[[Foo]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[FooArg]] = OpFunctionParameter %[[LongPtr]]
+
+define spir_kernel void @bar(ptr addrspace(1) %unknown_type_ptr) {
+entry:
+ call spir_func void @foo(ptr addrspace(1) %unknown_type_ptr)
+ ret void
+}
+
+define void @foo(ptr addrspace(1) %known_type_ptr) {
+entry:
+ %elem = getelementptr inbounds i32, ptr addrspace(1) %known_type_ptr, i64 0
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll
new file mode 100644
index 0000000..8cbf360
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll
@@ -0,0 +1,28 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "known_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo"
+; CHECK-SPIRV-DAG: OpName %[[ArgToDeduce:.*]] "unknown_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Bar:.*]] "bar"
+; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[Void:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[LongPtr:.*]] = OpTypePointer CrossWorkgroup %[[Long]]
+; CHECK-SPIRV-DAG: %[[Fun:.*]] = OpTypeFunction %[[Void]] %[[LongPtr]]
+; CHECK-SPIRV: %[[Foo]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[FooArg]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: %[[Bar]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[ArgToDeduce]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: OpFunctionCall %[[Void]] %[[Foo]] %[[ArgToDeduce]]
+
+define void @foo(ptr addrspace(1) %known_type_ptr) {
+entry:
+ %elem = getelementptr inbounds i32, ptr addrspace(1) %known_type_ptr, i64 0
+ ret void
+}
+
+define spir_kernel void @bar(ptr addrspace(1) %unknown_type_ptr) {
+entry:
+ call spir_func void @foo(ptr addrspace(1) %unknown_type_ptr)
+ ret void
+}
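;; NOTE (editor's sketch): together with type-deduce-by-call-rev.ll above, this
;; checks that call-graph-based pointee deduction is order-independent: the GEP
;; in @foo pins its argument to i32, and the call edge propagates that type to
;; @bar's otherwise-untyped parameter, so both orderings emit one shared
;; signature:
;;   %[[Long]]    = OpTypeInt 32 0
;;   %[[LongPtr]] = OpTypePointer CrossWorkgroup %[[Long]]
;;   %[[Fun]]     = OpTypeFunction %[[Void]] %[[LongPtr]]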
diff --git a/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll b/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll
new file mode 100644
index 0000000..f144418
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll
@@ -0,0 +1,29 @@
+; This test checks that the two functions receive distinct SPIR-V type
+; definitions even though their LLVM function types are identical.
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpName %[[Fun32:.*]] "tp_arg_i32"
+; CHECK-DAG: OpName %[[Fun64:.*]] "tp_arg_i64"
+; CHECK-DAG: %[[TyI32:.*]] = OpTypeInt 32 0
+; CHECK-DAG: %[[TyVoid:.*]] = OpTypeVoid
+; CHECK-DAG: %[[TyPtr32:.*]] = OpTypePointer Function %[[TyI32]]
+; CHECK-DAG: %[[TyFun32:.*]] = OpTypeFunction %[[TyVoid]] %[[TyPtr32]]
+; CHECK-DAG: %[[TyI64:.*]] = OpTypeInt 64 0
+; CHECK-DAG: %[[TyPtr64:.*]] = OpTypePointer Function %[[TyI64]]
+; CHECK-DAG: %[[TyFun64:.*]] = OpTypeFunction %[[TyVoid]] %[[TyPtr64]]
+; CHECK-DAG: %[[Fun32]] = OpFunction %[[TyVoid]] None %[[TyFun32]]
+; CHECK-DAG: %[[Fun64]] = OpFunction %[[TyVoid]] None %[[TyFun64]]
+
+define spir_kernel void @tp_arg_i32(ptr %ptr) {
+entry:
+ store i32 1, ptr %ptr
+ ret void
+}
+
+define spir_kernel void @tp_arg_i64(ptr %ptr) {
+entry:
+ store i64 1, ptr %ptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/relationals.ll b/llvm/test/CodeGen/SPIRV/relationals.ll
index 1644dc7..f4fcf4d 100644
--- a/llvm/test/CodeGen/SPIRV/relationals.ll
+++ b/llvm/test/CodeGen/SPIRV/relationals.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
declare dso_local spir_func <4 x i8> @_Z13__spirv_IsNanIDv4_aDv4_fET_T0_(<4 x float>)
declare dso_local spir_func <4 x i8> @_Z13__spirv_IsInfIDv4_aDv4_fET_T0_(<4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/simple.ll b/llvm/test/CodeGen/SPIRV/simple.ll
index de9efa8..63c1596 100644
--- a/llvm/test/CodeGen/SPIRV/simple.ll
+++ b/llvm/test/CodeGen/SPIRV/simple.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; Support for doubles is required.
; CHECK: OpCapability Float64
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
index fdb26ba..55cfcea 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; __kernel void testAtomicCompareExchangeExplicit_cl20(
;; volatile global atomic_int* object,
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll b/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll
index 55161e6..11b0578 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK: OpDecorate %[[#FUNC_NAME:]] LinkageAttributes "_Z10BitReversei"
; CHECK-NOT: OpBitReverse
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll
index 95f3673..b63c1c6 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV-DAG: %[[#]] = OpBuildNDRange %[[#]] %[[#GWS:]] %[[#LWS:]] %[[#GWO:]]
; CHECK-SPIRV-DAG: %[[#GWS]] = OpConstant %[[#]] 123
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll
index a2ae808..65c992c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll
@@ -19,6 +19,7 @@
;; bash$ $PATH_TO_GEN/bin/clang -cc1 -x cl -cl-std=CL2.0 -triple spir64-unknown-unknown -emit-llvm -include opencl-20.h BuildNDRange_2.cl -o BuildNDRange_2.ll
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; TODO(#60133): Requires updates following opaque pointer migration.
; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll b/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll
index 3403695..93aecc5 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; kernel void testConvertPtrToU(global int *a, global unsigned long *res) {
;; res[0] = (unsigned long)&a[0];
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll b/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll
index 2e9b4a4..d4fc5c3 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpDecorate %[[#ALIGNMENT:]] Alignment 16
; CHECK-SPIRV: %[[#ALIGNMENT]] = OpFunctionParameter %[[#]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll b/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll
index 64f25b7..966d835 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpName %[[#PTR_ID:]] "ptr"
; CHECK-SPIRV: OpName %[[#PTR2_ID:]] "ptr2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll b/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll
index 2f423c2..67c3380 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV-DAG: %[[#int:]] = OpTypeInt 32 0
; CHECK-SPIRV-DAG: %[[#int2:]] = OpTypeVector %[[#int]] 2
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll b/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll
index 6d6dd24..6e8726c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV-DAG: OpEntryPoint Kernel %[[#WORKER:]] "worker"
; CHECK-SPIRV-DAG: OpExecutionMode %[[#WORKER]] LocalSizeHint 128 10 1
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll b/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll
index 2796dcb..33bece5 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpDecorate %[[#]] UserSemantic "annotation_on_function"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll
index 331960c..417b89e 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; This test checks that the backend is capable of correctly translating
;; atomic_cmpxchg OpenCL C 1.2 built-in function [1] into corresponding SPIR-V
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll
index 95eb6ad..3180b57 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; This test checks that the backend is capable of correctly translating
;; legacy atomic OpenCL C 1.2 built-in functions [1] into corresponding SPIR-V
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll
index 0f3a62a..c94c130 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; This test checks that the backend is capable of correctly translating
;; atomic_work_item_fence OpenCL C 2.0 built-in function [1] into corresponding
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll
index a126d94..cf4a247 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; This test checks that the backend is capable of correctly translating
;; barrier OpenCL C 1.2 built-in function [1] into corresponding SPIR-V
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll
index 42b127c..5d9840d 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpCapability GroupNonUniformBallot
; CHECK-SPIRV: OpDecorate %[[#]] BuiltIn SubgroupGtMask
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll
index 0874e6f..0702fd0 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; This test checks that the backend is capable of correctly translating
;; sub_group_barrier built-in function [1] from cl_khr_subgroups extension into
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll b/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll
index 3c563d3..20204ac 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; Types:
; CHECK-DAG: %[[#INT:]] = OpTypeInt 32
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll b/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll
index d013abc..3e5a3ac 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; Check 'LLVM ==> SPIR-V' conversion of atomic_load and atomic_store.
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll b/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll
index 8dbf4d2..2c0fc39 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; Check that the bitcast is translated back to a bitcast
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll b/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll
index 5ecd7f7..2249cbe 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV1_4
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; TODO(#60133): Requires updates following opaque pointer migration.
; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll
index 9b1ce76..0a02a8b 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV-DAG: OpDecorate %[[#Id:]] BuiltIn GlobalInvocationId
; CHECK-SPIRV-DAG: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
index 8286671..f18f27a 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId
; CHECK-SPIRV: %[[#Id:]] = OpVariable %[[#]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
index 22aa40c..d39ca3c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; The IR was generated from the following source:
;; #include <CL/sycl.hpp>
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
index 5b3474f..03456ae 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; The IR was generated from the following source:
;; #include <CL/sycl.hpp>
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll b/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll
index 6de610b..824ca1b2 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: %[[#IMAGE_TYPE:]] = OpTypeImage
; CHECK-SPIRV: %[[#IMAGE_ARG:]] = OpFunctionParameter %[[#IMAGE_TYPE]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
index 52b7dac..d7e87c0 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
@@ -19,6 +19,7 @@
;; }
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV-DAG: OpCapability Sampled1D
; CHECK-SPIRV-DAG: OpCapability SampledBuffer
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll b/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll
index 9054454..0cd75bb 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpTypeDeviceEvent
; CHECK-SPIRV: OpFunction
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll b/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll
index cf124ec..d23b068 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; TODO(#60133): Requires updates following opaque pointer migration.
; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll b/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll
index c186a81..49b84c1 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpSatConvertSToU
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll b/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll
index fd29bc8..0ed1dc7 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; TODO(#60133): Requires updates following opaque pointer migration.
; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll b/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll
index 78d9a23..af76c0e 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpName %[[#r1:]] "r1"
; CHECK-SPIRV: OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll b/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll
index cfdcc728..550ec1a 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: %[[#]] = OpExtInst %[[#]] %[[#]] fclamp
; CHECK-SPIRV-NOT: %[[#]] = OpExtInst %[[#]] %[[#]] clamp
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll b/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll
index 572ccc3..46eaba9 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpName %[[#r1:]] "r1"
; CHECK-SPIRV: OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll b/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll
index d0ed564..79b7868 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpName %[[#r1:]] "r1"
; CHECK-SPIRV: OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll b/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll
index f506787b..683b5c2 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll
@@ -2,6 +2,7 @@
;; { out = fmod( in1, in2 ); }
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: %[[#]] = OpExtInst %[[#]] %[[#]] fmod %[[#]] %[[#]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll b/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll
index 886077a..fdab29c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpName %[[#r1:]] "r1"
; CHECK-SPIRV: OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll b/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll
index e17601a..60bbfe6 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpName %[[#r1:]] "r1"
; CHECK-SPIRV: OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll b/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll
index c035c35..974043c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV-NOT: OpCapability FPFastMathModeINTEL
; CHECK-SPIRV: OpName %[[#mu:]] "mul"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/frem.ll b/llvm/test/CodeGen/SPIRV/transcoding/frem.ll
index ecb8f6f..d36ba7f 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/frem.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/frem.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpName %[[#r1:]] "r1"
; CHECK-SPIRV: OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll b/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll
index 99d0d0e..3677c00 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV: OpName %[[#r1:]] "r1"
; CHECK-SPIRV: OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll b/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll
index dc307c7..fd24196 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; Types:
; CHECK-DAG: %[[#INT:]] = OpTypeInt 32
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll b/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll
index 2f44e19..ff1bec4 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV1_4
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; TODO(#60133): Requires updates following opaque pointer migration.
; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll b/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll
index 6aa9faa..2412f40 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV-DAG: %[[#int:]] = OpTypeInt 32 0
; CHECK-SPIRV-DAG: %[[#float:]] = OpTypeFloat 32
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll b/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll
index 3c818af..c5f3f9e 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-SPIRV-NOT: OpSConvert
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll b/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll
index f771854..de7673a 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; This test checks following SYCL relational builtins with double and double2
;; types:
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll b/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll
index 1f55ceb..69a4a30 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; This test checks following SYCL relational builtins with float and float2
;; types:
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll b/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll
index 864fb4f..d6a7fda 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll
@@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
;; This test checks following SYCL relational builtins with half and half2 types:
;; isfinite, isinf, isnan, isnormal, signbit, isequal, isnotequal, isgreater
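Note on the pattern repeated across all the SPIR-V hunks above: `%if spirv-tools %{ ... %}` is lit's conditional-substitution syntax. The enclosed command is expanded into the RUN line only when the test configuration advertises the `spirv-tools` feature, so builds without SPIRV-Tools simply skip the `spirv-val` validation step instead of failing it. A minimal standalone sketch of the pattern (the function body and CHECK line are illustrative, not taken from this patch):

; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK: OpFunction
define i32 @sum(i32 %a, i32 %b) {
  %r = add i32 %a, %b
  ret i32 %r
}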
diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
index 085cde8..7a5baa0 100644
--- a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
+++ b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
@@ -97,7 +97,7 @@ $_ZTI7Derived = comdat any
; Function Attrs: nounwind uwtable
define weak_odr dso_local dllexport void @_ZN4BaseC2Ev(ptr noundef nonnull align 8 dereferenceable(12) %0) unnamed_addr #0 comdat align 2 {
- store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+ store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
%2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1
store i32 0, ptr %2, align 8, !tbaa !8
ret void
@@ -105,7 +105,7 @@ define weak_odr dso_local dllexport void @_ZN4BaseC2Ev(ptr noundef nonnull align
; Function Attrs: nounwind uwtable
define weak_odr dso_local dllexport void @_ZN4BaseC1Ev(ptr noundef nonnull align 8 dereferenceable(12) %0) unnamed_addr #0 comdat align 2 {
- store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+ store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
%2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1
store i32 0, ptr %2, align 8, !tbaa !8
ret void
@@ -140,10 +140,10 @@ declare dso_local void @_ZdlPv(ptr noundef) local_unnamed_addr #2
; Function Attrs: nounwind uwtable
define weak_odr dso_local dllexport void @_ZN7DerivedC2Ev(ptr noundef nonnull align 8 dereferenceable(16) %0) unnamed_addr #0 comdat align 2 {
- store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+ store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
%2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1
store i32 0, ptr %2, align 8, !tbaa !8
- store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+ store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
%3 = getelementptr inbounds %class.Derived, ptr %0, i64 0, i32 1
store i32 0, ptr %3, align 4, !tbaa !12
ret void
@@ -151,10 +151,10 @@ define weak_odr dso_local dllexport void @_ZN7DerivedC2Ev(ptr noundef nonnull al
; Function Attrs: nounwind uwtable
define weak_odr dso_local dllexport void @_ZN7DerivedC1Ev(ptr noundef nonnull align 8 dereferenceable(16) %0) unnamed_addr #0 comdat align 2 {
- store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+ store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
%2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1
store i32 0, ptr %2, align 8, !tbaa !8
- store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+ store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
%3 = getelementptr inbounds %class.Derived, ptr %0, i64 0, i32 1
store i32 0, ptr %3, align 4, !tbaa !12
ret void
diff --git a/llvm/test/CodeGen/X86/tls-align.ll b/llvm/test/CodeGen/X86/tls-align.ll
index 3c8ee6b..e996c00 100644
--- a/llvm/test/CodeGen/X86/tls-align.ll
+++ b/llvm/test/CodeGen/X86/tls-align.ll
@@ -12,7 +12,7 @@
define internal fastcc void @foo() unnamed_addr {
entry:
- store <8 x ptr> <ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null>, ptr @array, align 32
+ store <8 x ptr> <ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null>, ptr @array, align 32
ret void
}
diff --git a/llvm/test/DebugInfo/X86/tu-to-non-tu.ll b/llvm/test/DebugInfo/X86/tu-to-non-tu.ll
index 3ad97ad..f80bd8b 100644
--- a/llvm/test/DebugInfo/X86/tu-to-non-tu.ll
+++ b/llvm/test/DebugInfo/X86/tu-to-non-tu.ll
@@ -156,14 +156,14 @@
%struct.templ_non_tu.1 = type { ptr }
@_ZTV6non_tu = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI6non_tu, ptr @_ZN6non_tu2f1Ev] }, align 8
-@v1 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV6non_tu, i32 0, inrange i32 0, i32 2) } }, align 8, !dbg !0
+@v1 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV6non_tu, i32 0, i32 0, i32 2) } }, align 8, !dbg !0
@v5 = dso_local global %struct.ref_internal zeroinitializer, align 1, !dbg !5
@_ZTV12templ_non_tuIiE = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI12templ_non_tuIiE, ptr @_ZN12templ_non_tuIiE2f1Ev] }, align 8
-@v2 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIiE, i32 0, inrange i32 0, i32 2) } }, align 8, !dbg !13
+@v2 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIiE, i32 0, i32 0, i32 2) } }, align 8, !dbg !13
@_ZTV12templ_non_tuIlE = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI12templ_non_tuIlE, ptr @_ZN12templ_non_tuIlE2f1Ev] }, align 8
-@v3 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIlE, i32 0, inrange i32 0, i32 2) } }, align 8, !dbg !32
+@v3 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIlE, i32 0, i32 0, i32 2) } }, align 8, !dbg !32
@_ZTV12templ_non_tuIbE = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI12templ_non_tuIbE, ptr @_ZN12templ_non_tuIbE2f1Ev] }, align 8
-@v4 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIbE, i32 0, inrange i32 0, i32 2) } }, align 8, !dbg !46
+@v4 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIbE, i32 0, i32 0, i32 2) } }, align 8, !dbg !46
@v6 = dso_local global %class.ref_internal_template zeroinitializer, align 1, !dbg !60
@v7 = dso_local global %class.ref_from_ref_internal_template zeroinitializer, align 1, !dbg !69
@_ZTVN10__cxxabiv117__class_type_infoE = external dso_local global ptr
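These hunks, together with the ThinLTO ones further down, mechanically drop the per-index `inrange` keyword from constant vtable GEPs. In the old textual form, `inrange` preceded one index of a constant `getelementptr` expression and promised that memory accesses through the resulting pointer stay within the sub-aggregate selected by that index, which is what lets vtable-splitting optimizations treat each vtable fragment independently. Only the syntactic rewrite is visible in the diff itself; a before/after sketch of the change applied throughout:

; old: the result may only be used to access the { [5 x ptr] } field picked by the marked index
store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0
; new: the same address computation without the per-index marker
store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0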
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
index 08ec5b3..efeaf83 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
@@ -3970,70 +3970,70 @@ buffer_atomic_max_u64 v[5:6], off, s[8:11], s3 offset:8388607 dlc
buffer_atomic_max_u64 v[5:6], off, s[8:11], s3 offset:8388607 glc slc dlc
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-buffer_atomic_min_f32 v255, off, s[8:11], s3 offset:8388607
+buffer_atomic_min_num_f32 v255, off, s[8:11], s3 offset:8388607
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-buffer_atomic_min_f32 v5, off, s[12:15], s3 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[12:15], s3 offset:8388607
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-buffer_atomic_min_f32 v5, off, s[96:99], s3 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[96:99], s3 offset:8388607
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-buffer_atomic_min_f32 v5, off, s[8:11], s101 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], s101 offset:8388607
// GFX12: encoding: [0x65,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-buffer_atomic_min_f32 v5, off, s[8:11], m0 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], m0 offset:8388607
// GFX12: encoding: [0x7d,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-buffer_atomic_min_f32 v5, off, s[8:11], 0 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], 0 offset:8388607
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-buffer_atomic_min_f32 v5, off, s[8:11], -1 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], -1 offset:8388607
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-buffer_atomic_min_f32 v5, off, s[8:11], 0.5 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], 0.5 offset:8388607
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-buffer_atomic_min_f32 v5, off, s[8:11], -4.0 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], -4.0 offset:8388607
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-buffer_atomic_min_f32 v5, v0, s[8:11], s3 idxen offset:8388607
+buffer_atomic_min_num_f32 v5, v0, s[8:11], s3 idxen offset:8388607
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-buffer_atomic_min_f32 v5, v0, s[8:11], s3 offen offset:8388607
+buffer_atomic_min_num_f32 v5, v0, s[8:11], s3 offen offset:8388607
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-buffer_atomic_min_f32 v5, off, s[8:11], s3
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:0
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:0
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:7
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:7
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RT_RETURN scope:SCOPE_SE
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RT_RETURN scope:SCOPE_SE
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f]
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV
// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f]
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 glc
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 glc
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 slc
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 slc
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 dlc
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 dlc
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 glc slc dlc
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 glc slc dlc
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
buffer_atomic_min_i32 v5, off, s[8:11], s3 offset:8388607
diff --git a/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_bar.ll b/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_bar.ll
index 721d6efb..d8c6525 100644
--- a/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_bar.ll
+++ b/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_bar.ll
@@ -23,7 +23,7 @@ define hidden i32 @_Z3barv() local_unnamed_addr #0 {
entry:
%b = alloca %struct.A, align 8
call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %b)
- store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %b, align 8, !tbaa !4
+ store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %b, align 8, !tbaa !4
%call = call i32 @_Z3fooP1A(ptr nonnull %b)
%add = add nsw i32 %call, 10
call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %b) #4
diff --git a/llvm/test/ThinLTO/X86/devirt_after_filtering_unreachable.ll b/llvm/test/ThinLTO/X86/devirt_after_filtering_unreachable.ll
index 68b83de..39f42da 100644
--- a/llvm/test/ThinLTO/X86/devirt_after_filtering_unreachable.ll
+++ b/llvm/test/ThinLTO/X86/devirt_after_filtering_unreachable.ll
@@ -71,7 +71,7 @@ target triple = "x86_64-unknown-linux-gnu"
define hidden i32 @main() {
entry:
%call = tail call ptr @_Znwm(i64 8)
- store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr %call
+ store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr %call
tail call void @_Z3fooP4Base(ptr nonnull %call)
ret i32 0
}
diff --git a/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll b/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
index 2417532..1f0737b 100644
--- a/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
+++ b/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
@@ -51,7 +51,7 @@ define i32 @_ZN1B1nEi(ptr %this, i32 %a) #0 comdat($_ZTV1B) {
; Ensures that vtable of B is live so that we will attempt devirt.
define dso_local i32 @use_B(ptr %a) {
entry:
- store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, inrange i32 0, i64 2), ptr %a, align 8
+ store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, i32 0, i64 2), ptr %a, align 8
ret i32 0
}
diff --git a/llvm/test/ThinLTO/X86/devirt_local_same_guid.ll b/llvm/test/ThinLTO/X86/devirt_local_same_guid.ll
index 3efea8d..2205545 100644
--- a/llvm/test/ThinLTO/X86/devirt_local_same_guid.ll
+++ b/llvm/test/ThinLTO/X86/devirt_local_same_guid.ll
@@ -37,7 +37,7 @@ define internal i32 @_ZN1B1nEi(ptr %this, i32 %a) #0 {
; Ensures that vtable of B is live so that we will attempt devirt.
define dso_local i32 @use_B(ptr %a) {
entry:
- store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, inrange i32 0, i64 2), ptr %a, align 8
+ store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, i32 0, i64 2), ptr %a, align 8
ret i32 0
}
diff --git a/llvm/test/ThinLTO/X86/lower_type_test_phi.ll b/llvm/test/ThinLTO/X86/lower_type_test_phi.ll
index 722ffe3..81d85f6 100644
--- a/llvm/test/ThinLTO/X86/lower_type_test_phi.ll
+++ b/llvm/test/ThinLTO/X86/lower_type_test_phi.ll
@@ -117,7 +117,7 @@ $_ZTV2D2 = comdat any
define ptr @_Z2b1v() {
entry:
%call = tail call ptr @_Znwm(i64 8)
- store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV2D1, i64 0, inrange i32 0, i64 2), ptr %call, align 8
+ store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV2D1, i64 0, i32 0, i64 2), ptr %call, align 8
ret ptr %call
}
@@ -126,7 +126,7 @@ declare ptr @_Znwm(i64)
define ptr @_Z2b2v() {
entry:
%call = tail call ptr @_Znwm(i64 8)
- store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV2D2, i64 0, inrange i32 0, i64 2), ptr %call, align 8
+ store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV2D2, i64 0, i32 0, i64 2), ptr %call, align 8
ret ptr %call
}
diff --git a/llvm/test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll b/llvm/test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll
index c6e61ed..7d71c59 100644
--- a/llvm/test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll
+++ b/llvm/test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll
@@ -55,7 +55,7 @@ entry:
%this.addr = alloca ptr, align 8
store ptr %this, ptr %this.addr, align 8
%this1 = load ptr, ptr %this.addr
- store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1D, i64 0, inrange i32 0, i64 2), ptr %this1, align 8
+ store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1D, i64 0, i32 0, i64 2), ptr %this1, align 8
ret void
}
diff --git a/llvm/test/ThinLTO/X86/type_test_noindircall.ll b/llvm/test/ThinLTO/X86/type_test_noindircall.ll
index 2d0faaa..cc85e44 100644
--- a/llvm/test/ThinLTO/X86/type_test_noindircall.ll
+++ b/llvm/test/ThinLTO/X86/type_test_noindircall.ll
@@ -38,8 +38,8 @@ target triple = "x86_64-grtev4-linux-gnu"
define internal void @_ZN12_GLOBAL__N_18RealFileD2Ev(ptr %this) unnamed_addr #0 align 2 {
entry:
; CHECK-IR: store
- store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN12_GLOBAL__N_18RealFileE, i64 0, inrange i32 0, i64 2), ptr %this, align 8
- %0 = tail call i1 @llvm.type.test(ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN12_GLOBAL__N_18RealFileE, i64 0, inrange i32 0, i64 2), metadata !"4$09c6cc733fc6accb91e5d7b87cb48f2d")
+ store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN12_GLOBAL__N_18RealFileE, i64 0, i32 0, i64 2), ptr %this, align 8
+ %0 = tail call i1 @llvm.type.test(ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN12_GLOBAL__N_18RealFileE, i64 0, i32 0, i64 2), metadata !"4$09c6cc733fc6accb91e5d7b87cb48f2d")
tail call void @llvm.assume(i1 %0)
; CHECK-IR-NEXT: ret void
ret void
diff --git a/llvm/test/Transforms/Float2Int/basic.ll b/llvm/test/Transforms/Float2Int/basic.ll
index 2854a83..a454b77 100644
--- a/llvm/test/Transforms/Float2Int/basic.ll
+++ b/llvm/test/Transforms/Float2Int/basic.ll
@@ -1,16 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='float2int' -S | FileCheck %s
+; RUN: opt < %s -passes=float2int -S | FileCheck %s -check-prefixes=CHECK,NONE
+; RUN: opt < %s -passes=float2int -S --data-layout="n64" | FileCheck %s -check-prefixes=CHECK,ONLY64
+; RUN: opt < %s -passes=float2int -S --data-layout="n8:16:32:64"| FileCheck %s -check-prefixes=CHECK,MULTIPLE
;
; Positive tests
;
define i16 @simple1(i8 %a) {
-; CHECK-LABEL: @simple1(
-; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT: [[T21:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; CHECK-NEXT: ret i16 [[TMP2]]
+; NONE-LABEL: @simple1(
+; NONE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT: [[T21:%.*]] = add i32 [[TMP1]], 1
+; NONE-NEXT: [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; NONE-NEXT: ret i16 [[TMP2]]
+;
+; ONLY64-LABEL: @simple1(
+; ONLY64-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT: [[T21:%.*]] = add i64 [[TMP1]], 1
+; ONLY64-NEXT: [[TMP2:%.*]] = trunc i64 [[T21]] to i16
+; ONLY64-NEXT: ret i16 [[TMP2]]
+;
+; MULTIPLE-LABEL: @simple1(
+; MULTIPLE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT: [[T21:%.*]] = add i16 [[TMP1]], 1
+; MULTIPLE-NEXT: ret i16 [[T21]]
;
%t1 = uitofp i8 %a to float
%t2 = fadd float %t1, 1.0
@@ -19,11 +32,23 @@ define i16 @simple1(i8 %a) {
}
define i8 @simple2(i8 %a) {
-; CHECK-LABEL: @simple2(
-; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT: [[T21:%.*]] = sub i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[T21]] to i8
-; CHECK-NEXT: ret i8 [[TMP2]]
+; NONE-LABEL: @simple2(
+; NONE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT: [[T21:%.*]] = sub i32 [[TMP1]], 1
+; NONE-NEXT: [[TMP2:%.*]] = trunc i32 [[T21]] to i8
+; NONE-NEXT: ret i8 [[TMP2]]
+;
+; ONLY64-LABEL: @simple2(
+; ONLY64-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT: [[T21:%.*]] = sub i64 [[TMP1]], 1
+; ONLY64-NEXT: [[TMP2:%.*]] = trunc i64 [[T21]] to i8
+; ONLY64-NEXT: ret i8 [[TMP2]]
+;
+; MULTIPLE-LABEL: @simple2(
+; MULTIPLE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT: [[T21:%.*]] = sub i16 [[TMP1]], 1
+; MULTIPLE-NEXT: [[TMP2:%.*]] = trunc i16 [[T21]] to i8
+; MULTIPLE-NEXT: ret i8 [[TMP2]]
;
%t1 = uitofp i8 %a to float
%t2 = fsub float %t1, 1.0
@@ -32,10 +57,22 @@ define i8 @simple2(i8 %a) {
}
define i32 @simple3(i8 %a) {
-; CHECK-LABEL: @simple3(
-; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT: [[T21:%.*]] = sub i32 [[TMP1]], 1
-; CHECK-NEXT: ret i32 [[T21]]
+; NONE-LABEL: @simple3(
+; NONE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT: [[T21:%.*]] = sub i32 [[TMP1]], 1
+; NONE-NEXT: ret i32 [[T21]]
+;
+; ONLY64-LABEL: @simple3(
+; ONLY64-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT: [[T21:%.*]] = sub i64 [[TMP1]], 1
+; ONLY64-NEXT: [[TMP2:%.*]] = trunc i64 [[T21]] to i32
+; ONLY64-NEXT: ret i32 [[TMP2]]
+;
+; MULTIPLE-LABEL: @simple3(
+; MULTIPLE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT: [[T21:%.*]] = sub i16 [[TMP1]], 1
+; MULTIPLE-NEXT: [[TMP2:%.*]] = zext i16 [[T21]] to i32
+; MULTIPLE-NEXT: ret i32 [[TMP2]]
;
%t1 = uitofp i8 %a to float
%t2 = fsub float %t1, 1.0
@@ -44,11 +81,23 @@ define i32 @simple3(i8 %a) {
}
define i1 @cmp(i8 %a, i8 %b) {
-; CHECK-LABEL: @cmp(
-; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[T31:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
-; CHECK-NEXT: ret i1 [[T31]]
+; NONE-LABEL: @cmp(
+; NONE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; NONE-NEXT: [[T31:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
+; NONE-NEXT: ret i1 [[T31]]
+;
+; ONLY64-LABEL: @cmp(
+; ONLY64-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
+; ONLY64-NEXT: [[T31:%.*]] = icmp slt i64 [[TMP1]], [[TMP2]]
+; ONLY64-NEXT: ret i1 [[T31]]
+;
+; MULTIPLE-LABEL: @cmp(
+; MULTIPLE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16
+; MULTIPLE-NEXT: [[T31:%.*]] = icmp slt i16 [[TMP1]], [[TMP2]]
+; MULTIPLE-NEXT: ret i1 [[T31]]
;
%t1 = uitofp i8 %a to float
%t2 = uitofp i8 %b to float
@@ -70,12 +119,27 @@ define i32 @simple4(i32 %a) {
}
define i32 @simple5(i8 %a, i8 %b) {
-; CHECK-LABEL: @simple5(
-; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[T31:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; CHECK-NEXT: ret i32 [[T42]]
+; NONE-LABEL: @simple5(
+; NONE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; NONE-NEXT: [[T31:%.*]] = add i32 [[TMP1]], 1
+; NONE-NEXT: [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; NONE-NEXT: ret i32 [[T42]]
+;
+; ONLY64-LABEL: @simple5(
+; ONLY64-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
+; ONLY64-NEXT: [[T31:%.*]] = add i64 [[TMP1]], 1
+; ONLY64-NEXT: [[T42:%.*]] = mul i64 [[T31]], [[TMP2]]
+; ONLY64-NEXT: [[TMP3:%.*]] = trunc i64 [[T42]] to i32
+; ONLY64-NEXT: ret i32 [[TMP3]]
+;
+; MULTIPLE-LABEL: @simple5(
+; MULTIPLE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; MULTIPLE-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; MULTIPLE-NEXT: [[T31:%.*]] = add i32 [[TMP1]], 1
+; MULTIPLE-NEXT: [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; MULTIPLE-NEXT: ret i32 [[T42]]
;
%t1 = uitofp i8 %a to float
%t2 = uitofp i8 %b to float
@@ -86,12 +150,27 @@ define i32 @simple5(i8 %a, i8 %b) {
}
define i32 @simple6(i8 %a, i8 %b) {
-; CHECK-LABEL: @simple6(
-; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[T31:%.*]] = sub i32 0, [[TMP1]]
-; CHECK-NEXT: [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; CHECK-NEXT: ret i32 [[T42]]
+; NONE-LABEL: @simple6(
+; NONE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; NONE-NEXT: [[T31:%.*]] = sub i32 0, [[TMP1]]
+; NONE-NEXT: [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; NONE-NEXT: ret i32 [[T42]]
+;
+; ONLY64-LABEL: @simple6(
+; ONLY64-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
+; ONLY64-NEXT: [[T31:%.*]] = sub i64 0, [[TMP1]]
+; ONLY64-NEXT: [[T42:%.*]] = mul i64 [[T31]], [[TMP2]]
+; ONLY64-NEXT: [[TMP3:%.*]] = trunc i64 [[T42]] to i32
+; ONLY64-NEXT: ret i32 [[TMP3]]
+;
+; MULTIPLE-LABEL: @simple6(
+; MULTIPLE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; MULTIPLE-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; MULTIPLE-NEXT: [[T31:%.*]] = sub i32 0, [[TMP1]]
+; MULTIPLE-NEXT: [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; MULTIPLE-NEXT: ret i32 [[T42]]
;
%t1 = uitofp i8 %a to float
%t2 = uitofp i8 %b to float
@@ -105,15 +184,37 @@ define i32 @simple6(i8 %a, i8 %b) {
; cause failure of the other.
define i32 @multi1(i8 %a, i8 %b, i8 %c, float %d) {
-; CHECK-LABEL: @multi1(
-; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT: [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
-; CHECK-NEXT: [[X1:%.*]] = add i32 [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
-; CHECK-NEXT: [[W:%.*]] = fptoui float [[Z]] to i32
-; CHECK-NEXT: [[R:%.*]] = add i32 [[X1]], [[W]]
-; CHECK-NEXT: ret i32 [[R]]
+; NONE-LABEL: @multi1(
+; NONE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; NONE-NEXT: [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
+; NONE-NEXT: [[X1:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; NONE-NEXT: [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
+; NONE-NEXT: [[W:%.*]] = fptoui float [[Z]] to i32
+; NONE-NEXT: [[R:%.*]] = add i32 [[X1]], [[W]]
+; NONE-NEXT: ret i32 [[R]]
+;
+; ONLY64-LABEL: @multi1(
+; ONLY64-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
+; ONLY64-NEXT: [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
+; ONLY64-NEXT: [[X1:%.*]] = add i64 [[TMP1]], [[TMP2]]
+; ONLY64-NEXT: [[TMP3:%.*]] = trunc i64 [[X1]] to i32
+; ONLY64-NEXT: [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
+; ONLY64-NEXT: [[W:%.*]] = fptoui float [[Z]] to i32
+; ONLY64-NEXT: [[R:%.*]] = add i32 [[TMP3]], [[W]]
+; ONLY64-NEXT: ret i32 [[R]]
+;
+; MULTIPLE-LABEL: @multi1(
+; MULTIPLE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16
+; MULTIPLE-NEXT: [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
+; MULTIPLE-NEXT: [[X1:%.*]] = add i16 [[TMP1]], [[TMP2]]
+; MULTIPLE-NEXT: [[TMP3:%.*]] = zext i16 [[X1]] to i32
+; MULTIPLE-NEXT: [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
+; MULTIPLE-NEXT: [[W:%.*]] = fptoui float [[Z]] to i32
+; MULTIPLE-NEXT: [[R:%.*]] = add i32 [[TMP3]], [[W]]
+; MULTIPLE-NEXT: ret i32 [[R]]
;
%fa = uitofp i8 %a to float
%fb = uitofp i8 %b to float
@@ -127,11 +228,22 @@ define i32 @multi1(i8 %a, i8 %b, i8 %c, float %d) {
}
define i16 @simple_negzero(i8 %a) {
-; CHECK-LABEL: @simple_negzero(
-; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT: [[T21:%.*]] = add i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; CHECK-NEXT: ret i16 [[TMP2]]
+; NONE-LABEL: @simple_negzero(
+; NONE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT: [[T21:%.*]] = add i32 [[TMP1]], 0
+; NONE-NEXT: [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; NONE-NEXT: ret i16 [[TMP2]]
+;
+; ONLY64-LABEL: @simple_negzero(
+; ONLY64-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT: [[T21:%.*]] = add i64 [[TMP1]], 0
+; ONLY64-NEXT: [[TMP2:%.*]] = trunc i64 [[T21]] to i16
+; ONLY64-NEXT: ret i16 [[TMP2]]
+;
+; MULTIPLE-LABEL: @simple_negzero(
+; MULTIPLE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT: [[T21:%.*]] = add i16 [[TMP1]], 0
+; MULTIPLE-NEXT: ret i16 [[T21]]
;
%t1 = uitofp i8 %a to float
%t2 = fadd fast float %t1, -0.0
@@ -140,12 +252,26 @@ define i16 @simple_negzero(i8 %a) {
}
define i32 @simple_negative(i8 %call) {
-; CHECK-LABEL: @simple_negative(
-; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i32
-; CHECK-NEXT: [[MUL1:%.*]] = mul i32 [[TMP1]], -3
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[MUL1]] to i8
-; CHECK-NEXT: [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
-; CHECK-NEXT: ret i32 [[CONV3]]
+; NONE-LABEL: @simple_negative(
+; NONE-NEXT: [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i32
+; NONE-NEXT: [[MUL1:%.*]] = mul i32 [[TMP1]], -3
+; NONE-NEXT: [[TMP2:%.*]] = trunc i32 [[MUL1]] to i8
+; NONE-NEXT: [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
+; NONE-NEXT: ret i32 [[CONV3]]
+;
+; ONLY64-LABEL: @simple_negative(
+; ONLY64-NEXT: [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i64
+; ONLY64-NEXT: [[MUL1:%.*]] = mul i64 [[TMP1]], -3
+; ONLY64-NEXT: [[TMP2:%.*]] = trunc i64 [[MUL1]] to i8
+; ONLY64-NEXT: [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
+; ONLY64-NEXT: ret i32 [[CONV3]]
+;
+; MULTIPLE-LABEL: @simple_negative(
+; MULTIPLE-NEXT: [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i16
+; MULTIPLE-NEXT: [[MUL1:%.*]] = mul i16 [[TMP1]], -3
+; MULTIPLE-NEXT: [[TMP2:%.*]] = trunc i16 [[MUL1]] to i8
+; MULTIPLE-NEXT: [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
+; MULTIPLE-NEXT: ret i32 [[CONV3]]
;
%conv1 = sitofp i8 %call to float
%mul = fmul float %conv1, -3.000000e+00
@@ -155,11 +281,22 @@ define i32 @simple_negative(i8 %call) {
}
define i16 @simple_fneg(i8 %a) {
-; CHECK-LABEL: @simple_fneg(
-; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT: [[T21:%.*]] = sub i32 0, [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; CHECK-NEXT: ret i16 [[TMP2]]
+; NONE-LABEL: @simple_fneg(
+; NONE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT: [[T21:%.*]] = sub i32 0, [[TMP1]]
+; NONE-NEXT: [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; NONE-NEXT: ret i16 [[TMP2]]
+;
+; ONLY64-LABEL: @simple_fneg(
+; ONLY64-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT: [[T21:%.*]] = sub i64 0, [[TMP1]]
+; ONLY64-NEXT: [[TMP2:%.*]] = trunc i64 [[T21]] to i16
+; ONLY64-NEXT: ret i16 [[TMP2]]
+;
+; MULTIPLE-LABEL: @simple_fneg(
+; MULTIPLE-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT: [[T21:%.*]] = sub i16 0, [[TMP1]]
+; MULTIPLE-NEXT: ret i16 [[T21]]
;
%t1 = uitofp i8 %a to float
%t2 = fneg fast float %t1
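The split check prefixes above make the new Float2Int behavior explicit: the replacement integer type is now sized from the data layout's native integer widths (the `n` spec) rather than fixed at i32. With no layout (NONE) the pass keeps i32; under `n64` (ONLY64) it must widen to i64 and truncate back; under `n8:16:32:64` (MULTIPLE) it can pick the smallest native width that holds the value range, e.g. i16 for an i8 input plus 1. A condensed sketch of the MULTIPLE case, mirroring @simple1 above (function name is illustrative):

; RUN: opt < %s -passes=float2int -S --data-layout="n8:16:32:64" | FileCheck %s
define i16 @demo(i8 %a) {
; CHECK-LABEL: @demo(
; CHECK-NEXT: [[Z:%.*]] = zext i8 %a to i16
; CHECK-NEXT: [[R:%.*]] = add i16 [[Z]], 1
; CHECK-NEXT: ret i16 [[R]]
  %f = uitofp i8 %a to float
  %g = fadd float %f, 1.0
  %r = fptoui float %g to i16
  ret i16 %r
}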
diff --git a/llvm/test/Transforms/IRCE/compound-loop-bound.ll b/llvm/test/Transforms/IRCE/compound-loop-bound.ll
index 0930d19..e50d8c6 100644
--- a/llvm/test/Transforms/IRCE/compound-loop-bound.ll
+++ b/llvm/test/Transforms/IRCE/compound-loop-bound.ll
@@ -16,23 +16,56 @@ define void @incrementing_loop(ptr %arr, ptr %len_ptr, i32 %K, i32 %M) {
; CHECK-NEXT: br i1 [[AND]], label [[PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK: preheader:
; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[K]], i32 [[M]])
+; CHECK-NEXT: [[SMIN1:%.*]] = call i32 @llvm.smin.i32(i32 [[LEN]], i32 [[M]])
+; CHECK-NEXT: [[SMIN2:%.*]] = call i32 @llvm.smin.i32(i32 [[SMIN1]], i32 [[K]])
+; CHECK-NEXT: [[EXIT_MAINLOOP_AT:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN2]], i32 0)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i32 0, [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT: br i1 [[TMP0]], label [[LOOP_PREHEADER:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK: loop.preheader:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[IDX_NEXT:%.*]], [[IN_BOUNDS:%.*]] ]
-; CHECK-NEXT: [[IDX_NEXT]] = add i32 [[IDX]], 1
+; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_NEXT:%.*]], [[IN_BOUNDS:%.*]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT: [[IDX_NEXT]] = add nsw i32 [[IDX]], 1
; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IDX]], [[LEN]]
-; CHECK-NEXT: br i1 [[GUARD]], label [[IN_BOUNDS]], label [[OUT_OF_BOUNDS:%.*]]
+; CHECK-NEXT: br i1 true, label [[IN_BOUNDS]], label [[OUT_OF_BOUNDS_LOOPEXIT3:%.*]]
; CHECK: in.bounds:
; CHECK-NEXT: [[ADDR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[IDX]]
; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
; CHECK-NEXT: [[NEXT:%.*]] = icmp slt i32 [[IDX_NEXT]], [[SMIN]]
-; CHECK-NEXT: br i1 [[NEXT]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[IDX_NEXT]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT: br i1 [[TMP1]], label [[LOOP]], label [[MAIN_EXIT_SELECTOR:%.*]]
+; CHECK: main.exit.selector:
+; CHECK-NEXT: [[IDX_NEXT_LCSSA:%.*]] = phi i32 [ [[IDX_NEXT]], [[IN_BOUNDS]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[IDX_NEXT_LCSSA]], [[SMIN]]
+; CHECK-NEXT: br i1 [[TMP2]], label [[MAIN_PSEUDO_EXIT]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK: main.pseudo.exit:
+; CHECK-NEXT: [[IDX_COPY:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[IDX_NEXT_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT: [[INDVAR_END:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[IDX_NEXT_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT: br label [[POSTLOOP:%.*]]
+; CHECK: out.of.bounds.loopexit:
+; CHECK-NEXT: br label [[OUT_OF_BOUNDS:%.*]]
+; CHECK: out.of.bounds.loopexit3:
+; CHECK-NEXT: br label [[OUT_OF_BOUNDS]]
; CHECK: out.of.bounds:
; CHECK-NEXT: ret void
+; CHECK: exit.loopexit.loopexit:
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT]]
; CHECK: exit.loopexit:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret void
+; CHECK: postloop:
+; CHECK-NEXT: br label [[LOOP_POSTLOOP:%.*]]
+; CHECK: loop.postloop:
+; CHECK-NEXT: [[IDX_POSTLOOP:%.*]] = phi i32 [ [[IDX_COPY]], [[POSTLOOP]] ], [ [[IDX_NEXT_POSTLOOP:%.*]], [[IN_BOUNDS_POSTLOOP:%.*]] ]
+; CHECK-NEXT: [[IDX_NEXT_POSTLOOP]] = add i32 [[IDX_POSTLOOP]], 1
+; CHECK-NEXT: [[GUARD_POSTLOOP:%.*]] = icmp slt i32 [[IDX_POSTLOOP]], [[LEN]]
+; CHECK-NEXT: br i1 [[GUARD_POSTLOOP]], label [[IN_BOUNDS_POSTLOOP]], label [[OUT_OF_BOUNDS_LOOPEXIT:%.*]]
+; CHECK: in.bounds.postloop:
+; CHECK-NEXT: [[ADDR_POSTLOOP:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[IDX_POSTLOOP]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_POSTLOOP]], align 4
+; CHECK-NEXT: [[NEXT_POSTLOOP:%.*]] = icmp slt i32 [[IDX_NEXT_POSTLOOP]], [[SMIN]]
+; CHECK-NEXT: br i1 [[NEXT_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP1:![0-9]+]], !loop_constrainer.loop.clone !6
;
entry:
%len = load i32, ptr %len_ptr, !range !0
@@ -78,24 +111,58 @@ define void @decrementing_loop(ptr %arr, ptr %len_ptr, i32 %K, i32 %M) {
; CHECK-NEXT: [[AND:%.*]] = and i1 [[CHECK0]], [[CHECK1]]
; CHECK-NEXT: br i1 [[AND]], label [[PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK: preheader:
-; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[K]], i32 [[M]])
+; CHECK-NEXT: [[INDVAR_START:%.*]] = call i32 @llvm.smin.i32(i32 [[K]], i32 [[M]])
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDVAR_START]], 1
+; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[LEN]], i32 [[TMP0]])
+; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN]], i32 0)
+; CHECK-NEXT: [[EXIT_PRELOOP_AT:%.*]] = add nsw i32 [[SMAX]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[INDVAR_START]], [[EXIT_PRELOOP_AT]]
+; CHECK-NEXT: br i1 [[TMP1]], label [[LOOP_PRELOOP_PREHEADER:%.*]], label [[PRELOOP_PSEUDO_EXIT:%.*]]
+; CHECK: loop.preloop.preheader:
+; CHECK-NEXT: br label [[LOOP_PRELOOP:%.*]]
+; CHECK: mainloop:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[SMIN]], [[PREHEADER]] ], [ [[IDX_DEC:%.*]], [[IN_BOUNDS:%.*]] ]
-; CHECK-NEXT: [[IDX_DEC]] = sub i32 [[IDX]], 1
+; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_PRELOOP_COPY:%.*]], [[MAINLOOP:%.*]] ], [ [[IDX_DEC:%.*]], [[IN_BOUNDS:%.*]] ]
+; CHECK-NEXT: [[IDX_DEC]] = sub nsw i32 [[IDX]], 1
; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IDX]], [[LEN]]
-; CHECK-NEXT: br i1 [[GUARD]], label [[IN_BOUNDS]], label [[OUT_OF_BOUNDS:%.*]]
+; CHECK-NEXT: br i1 true, label [[IN_BOUNDS]], label [[OUT_OF_BOUNDS_LOOPEXIT1:%.*]]
; CHECK: in.bounds:
; CHECK-NEXT: [[ADDR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[IDX]]
; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
; CHECK-NEXT: [[NEXT:%.*]] = icmp sgt i32 [[IDX_DEC]], -1
-; CHECK-NEXT: br i1 [[NEXT]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK-NEXT: br i1 [[NEXT]], label [[LOOP]], label [[EXIT_LOOPEXIT_LOOPEXIT:%.*]]
+; CHECK: out.of.bounds.loopexit:
+; CHECK-NEXT: br label [[OUT_OF_BOUNDS:%.*]]
+; CHECK: out.of.bounds.loopexit1:
+; CHECK-NEXT: br label [[OUT_OF_BOUNDS]]
; CHECK: out.of.bounds:
; CHECK-NEXT: ret void
+; CHECK: exit.loopexit.loopexit:
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT:%.*]]
; CHECK: exit.loopexit:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret void
+; CHECK: loop.preloop:
+; CHECK-NEXT: [[IDX_PRELOOP:%.*]] = phi i32 [ [[IDX_DEC_PRELOOP:%.*]], [[IN_BOUNDS_PRELOOP:%.*]] ], [ [[INDVAR_START]], [[LOOP_PRELOOP_PREHEADER]] ]
+; CHECK-NEXT: [[IDX_DEC_PRELOOP]] = sub i32 [[IDX_PRELOOP]], 1
+; CHECK-NEXT: [[GUARD_PRELOOP:%.*]] = icmp slt i32 [[IDX_PRELOOP]], [[LEN]]
+; CHECK-NEXT: br i1 [[GUARD_PRELOOP]], label [[IN_BOUNDS_PRELOOP]], label [[OUT_OF_BOUNDS_LOOPEXIT:%.*]]
+; CHECK: in.bounds.preloop:
+; CHECK-NEXT: [[ADDR_PRELOOP:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[IDX_PRELOOP]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_PRELOOP]], align 4
+; CHECK-NEXT: [[NEXT_PRELOOP:%.*]] = icmp sgt i32 [[IDX_DEC_PRELOOP]], -1
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[IDX_DEC_PRELOOP]], [[EXIT_PRELOOP_AT]]
+; CHECK-NEXT: br i1 [[TMP2]], label [[LOOP_PRELOOP]], label [[PRELOOP_EXIT_SELECTOR:%.*]], !llvm.loop [[LOOP7:![0-9]+]], !loop_constrainer.loop.clone !6
+; CHECK: preloop.exit.selector:
+; CHECK-NEXT: [[IDX_DEC_PRELOOP_LCSSA:%.*]] = phi i32 [ [[IDX_DEC_PRELOOP]], [[IN_BOUNDS_PRELOOP]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[IDX_DEC_PRELOOP_LCSSA]], -1
+; CHECK-NEXT: br i1 [[TMP3]], label [[PRELOOP_PSEUDO_EXIT]], label [[EXIT_LOOPEXIT]]
+; CHECK: preloop.pseudo.exit:
+; CHECK-NEXT: [[IDX_PRELOOP_COPY]] = phi i32 [ [[INDVAR_START]], [[PREHEADER]] ], [ [[IDX_DEC_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT: [[INDVAR_END:%.*]] = phi i32 [ [[INDVAR_START]], [[PREHEADER]] ], [ [[IDX_DEC_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT: br label [[MAINLOOP]]
;
entry:
%len = load i32, ptr %len_ptr, !range !0
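What the rewritten IRCE checks encode: the pass splits each range-checked loop into an unchecked main loop plus guarded clone loops (tagged with !loop_constrainer.loop.clone). For the incrementing loop, the main-loop limit is exactly the SMIN/SMAX chain in the checks, readable as

  exit.mainloop.at = smax(smin(smin(len, M), K), 0)

so for every iteration with idx < exit.mainloop.at the guard `idx < len` is provably true and folds to `br i1 true`, while the remaining iterations run in the post-loop with the original guard intact. The decrementing loop is the mirror image: it peels a pre-loop that runs while idx stays above exit.preloop.at = smax(smin(len, indvar.start + 1), 0) - 1, then enters the unchecked main loop.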
diff --git a/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll b/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll
new file mode 100644
index 0000000..2b2f820
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i32 @shl_cttz_false(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @shl_cttz_false(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[Y]], i1 true), !range [[RNG0:![0-9]+]]
+; CHECK-NEXT: [[RES:%.*]] = shl i32 [[X]], [[CTTZ]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+ %res = shl i32 %x, %cttz
+ ret i32 %res
+}
+
+define i32 @shl_ctlz_false(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @shl_ctlz_false(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CTTZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[Y]], i1 true), !range [[RNG0]]
+; CHECK-NEXT: [[RES:%.*]] = shl i32 [[X]], [[CTTZ]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %cttz = call i32 @llvm.ctlz.i32(i32 %y, i1 false)
+ %res = shl i32 %x, %cttz
+ ret i32 %res
+}
+
+define i32 @lshr_cttz_false(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @lshr_cttz_false(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[Y]], i1 true), !range [[RNG0]]
+; CHECK-NEXT: [[RES:%.*]] = lshr i32 [[X]], [[CTTZ]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+ %res = lshr i32 %x, %cttz
+ ret i32 %res
+}
+
+define i32 @ashr_cttz_false(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @ashr_cttz_false(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[Y]], i1 true), !range [[RNG0]]
+; CHECK-NEXT: [[RES:%.*]] = ashr i32 [[X]], [[CTTZ]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+ %res = ashr i32 %x, %cttz
+ ret i32 %res
+}
+
+define i32 @shl_cttz_false_multiuse(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @shl_cttz_false_multiuse(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[Y]], i1 false), !range [[RNG0]]
+; CHECK-NEXT: call void @use(i32 [[CTTZ]])
+; CHECK-NEXT: [[RES:%.*]] = shl i32 [[X]], [[CTTZ]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+ call void @use(i32 %cttz)
+ %res = shl i32 %x, %cttz
+ ret i32 %res
+}
+
+define i32 @shl_cttz_as_lhs(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @shl_cttz_as_lhs(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[Y]], i1 false), !range [[RNG0]]
+; CHECK-NEXT: [[RES:%.*]] = shl i32 [[CTTZ]], [[X]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+ %res = shl i32 %cttz, %x
+ ret i32 %res
+}
+
+declare void @use(i32)
+;.
+; CHECK: [[RNG0]] = !{i32 0, i32 33}
+;.
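The new file pins down a small InstCombine refinement: when a cttz/ctlz call with is_zero_poison=false feeds the shift amount of a shl/lshr/ashr and has no other users, the flag can be flipped to true, because a zero operand would make the count equal the bit width, and shifting by the bit width already yields poison. @shl_cttz_false_multiuse and @shl_cttz_as_lhs check the two guards: an extra user of the count, or the count appearing as the shifted value rather than the amount, blocks the flip. The transform on the first test, as the checks show:

; before: %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
;         %res  = shl i32 %x, %cttz
; after:  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 true), !range !{i32 0, i32 33}
;         %res  = shl i32 %x, %cttz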
diff --git a/llvm/test/Transforms/InstCombine/zext.ll b/llvm/test/Transforms/InstCombine/zext.ll
index edbd485..88cd9c7 100644
--- a/llvm/test/Transforms/InstCombine/zext.ll
+++ b/llvm/test/Transforms/InstCombine/zext.ll
@@ -836,3 +836,34 @@ define i64 @zext_nneg_demanded_constant(i8 %a) nounwind {
%c = and i64 %b, 254
ret i64 %c
}
+
+define i32 @zext_nneg_i1(i1 %x) {
+; CHECK-LABEL: @zext_nneg_i1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %res = zext nneg i1 %x to i32
+ ret i32 %res
+}
+
+define <2 x i32> @zext_nneg_i1_vec(<2 x i1> %x) {
+; CHECK-LABEL: @zext_nneg_i1_vec(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <2 x i32> zeroinitializer
+;
+entry:
+ %res = zext nneg <2 x i1> %x to <2 x i32>
+ ret <2 x i32> %res
+}
+
+define i32 @zext_nneg_i2(i2 %x) {
+; CHECK-LABEL: @zext_nneg_i2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RES:%.*]] = zext nneg i2 [[X:%.*]] to i32
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %res = zext nneg i2 %x to i32
+ ret i32 %res
+}
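The three added functions document how InstCombine exploits the nneg flag on zext: nneg asserts the operand is non-negative when read as a signed value, with poison otherwise. A signed i1 is non-negative only when it is 0, so `zext nneg i1` folds to the constant 0 (zeroinitializer in the vector case), while i2 still has the non-negative nonzero value 1 and is left untouched. A sketch under the same pipeline (function name is illustrative):

; RUN: opt < %s -passes=instcombine -S | FileCheck %s
define i32 @fold_nneg_bool(i1 %x) {
; CHECK-LABEL: @fold_nneg_bool(
; CHECK-NEXT: ret i32 0
  %r = zext nneg i1 %x to i32 ; any %x other than 0 would be poison here
  ret i32 %r
}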
diff --git a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
index 7f20963..2c665a4 100644
--- a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
@@ -7,7 +7,7 @@ define void @foo(ptr %h) !dbg !4 {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: ptr [[H:%.*]]) !dbg [[DBG4:![0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 0, metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG20:![0-9]+]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 0, metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG20:![0-9]+]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG21:![0-9]+]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]], !dbg [[DBG21]]
@@ -27,15 +27,15 @@ define void @foo(ptr %h) !dbg !4 {
; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x ptr> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !dbg [[DBG22]]
; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>, !dbg [[DBG24:![0-9]+]]
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], <i64 5, i64 5, i64 5, i64 5>, !dbg [[DBG25:![0-9]+]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP32]], label [[FOR_COND5_PREHEADER1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0, !dbg [[DBG26:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP32]], label [[FOR_COND5_PREHEADER1]], !dbg [[DBG26]]
; CHECK: for.cond.cleanup32:
-; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>, !dbg [[DBG26:![0-9]+]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP7]], <i64 23, i64 23, i64 23, i64 23>, !dbg [[DBG27:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>, !dbg [[DBG27:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP7]], <i64 23, i64 23, i64 23, i64 23>, !dbg [[DBG28:![0-9]+]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]], !dbg [[DBG21]]
; CHECK: scalar.ph:
@@ -43,8 +43,8 @@ define void @foo(ptr %h) !dbg !4 {
; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]], !dbg [[DBG21]]
; CHECK: for.cond1.preheader:
; CHECK-NEXT: [[I_023:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC13:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[I_023]], metadata [[META11]], metadata !DIExpression()), !dbg [[DBG20]]
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER:%.*]], !dbg [[DBG32:![0-9]+]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[I_023]], metadata [[META11]], metadata !DIExpression()), !dbg [[DBG20]]
+; CHECK-NEXT: br label [[FOR_COND5_PREHEADER:%.*]], !dbg [[DBG26]]
; CHECK: for.cond5.preheader:
; CHECK-NEXT: [[L_022:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INC10:%.*]], [[FOR_COND5_PREHEADER]] ]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[H]], i64 [[L_022]]
@@ -57,11 +57,11 @@ define void @foo(ptr %h) !dbg !4 {
; CHECK-NEXT: store i32 3, ptr [[ARRAYIDX_3]], align 4, !dbg [[DBG22]]
; CHECK-NEXT: [[INC10]] = add nuw nsw i64 [[L_022]], 1, !dbg [[DBG24]]
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC10]], 5, !dbg [[DBG25]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_COND5_PREHEADER]], !dbg [[DBG32]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_COND5_PREHEADER]], !dbg [[DBG26]]
; CHECK: for.cond.cleanup3:
-; CHECK-NEXT: [[INC13]] = add nuw nsw i64 [[I_023]], 1, !dbg [[DBG26]]
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[INC13]], metadata [[META11]], metadata !DIExpression()), !dbg [[DBG20]]
-; CHECK-NEXT: [[EXITCOND24_NOT:%.*]] = icmp eq i64 [[INC13]], 23, !dbg [[DBG27]]
+; CHECK-NEXT: [[INC13]] = add nuw nsw i64 [[I_023]], 1, !dbg [[DBG27]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[INC13]], metadata [[META11]], metadata !DIExpression()), !dbg [[DBG20]]
+; CHECK-NEXT: [[EXITCOND24_NOT:%.*]] = icmp eq i64 [[INC13]], 23, !dbg [[DBG28]]
; CHECK-NEXT: br i1 [[EXITCOND24_NOT]], label [[EXIT]], label [[FOR_COND1_PREHEADER]], !dbg [[DBG21]], !llvm.loop [[LOOP34:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void, !dbg [[DBG35:![0-9]+]]
@@ -163,14 +163,14 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
; CHECK: [[META23]] = distinct !DILexicalBlock(scope: [[META18]], file: [[META1]], line: 12, column: 7)
; CHECK: [[DBG24]] = !DILocation(line: 11, column: 32, scope: [[META19]])
; CHECK: [[DBG25]] = !DILocation(line: 11, column: 26, scope: [[META19]])
-; CHECK: [[DBG26]] = !DILocation(line: 10, column: 30, scope: [[META16]])
-; CHECK: [[DBG27]] = !DILocation(line: 10, column: 24, scope: [[META16]])
-; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[DBG21]], [[META29:![0-9]+]], [[META30:![0-9]+]], [[META31:![0-9]+]]}
-; CHECK: [[META29]] = !DILocation(line: 13, column: 13, scope: [[META12]])
-; CHECK: [[META30]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META31]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[DBG32]] = !DILocation(line: 11, column: 5, scope: [[META15]])
+; CHECK: [[DBG26]] = !DILocation(line: 11, column: 5, scope: [[META15]])
+; CHECK: [[DBG27]] = !DILocation(line: 10, column: 30, scope: [[META16]])
+; CHECK: [[DBG28]] = !DILocation(line: 10, column: 24, scope: [[META16]])
+; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[DBG21]], [[META30:![0-9]+]], [[META31:![0-9]+]], [[META32:![0-9]+]]}
+; CHECK: [[META30]] = !DILocation(line: 13, column: 13, scope: [[META12]])
+; CHECK: [[META31]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META32]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[DBG33]] = !DILocation(line: 13, column: 2, scope: [[META23]])
-; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[DBG21]], [[META29]], [[META30]]}
+; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[DBG21]], [[META30]], [[META31]]}
; CHECK: [[DBG35]] = !DILocation(line: 14, column: 1, scope: [[DBG4]])
;.
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
new file mode 100644
index 0000000..705e425
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @h() {
+; CHECK-LABEL: define void @h() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %conv9 = zext i16 0 to i32
+ %arrayidx2 = getelementptr i8, ptr null, i64 16
+ %conv310 = zext i16 0 to i32
+ %add4 = or i32 %conv310, %conv9
+ %sub = or i32 %conv9, %conv310
+ %conv15 = sext i16 0 to i32
+ %shr = ashr i32 0, 0
+ %arrayidx18 = getelementptr i8, ptr null, i64 24
+ %conv19 = sext i16 0 to i32
+ %sub20 = or i32 %shr, %conv19
+ %shr29 = ashr i32 0, 0
+ %add30 = or i32 %shr29, %conv15
+ %sub39 = or i32 %sub, %sub20
+ %conv40 = trunc i32 %sub39 to i16
+ store i16 %conv40, ptr %arrayidx2, align 2
+ %sub44 = or i32 %add4, %add30
+ %conv45 = trunc i32 %sub44 to i16
+ store i16 %conv45, ptr %arrayidx18, align 2
+ %arrayidx2.1 = getelementptr i8, ptr null, i64 18
+ %conv3.112 = zext i16 0 to i32
+ %add4.1 = or i32 %conv3.112, 0
+ %sub.1 = or i32 0, %conv3.112
+ %conv15.1 = sext i16 0 to i32
+ %shr.1 = ashr i32 0, 0
+ %arrayidx18.1 = getelementptr i8, ptr null, i64 26
+ %conv19.1 = sext i16 0 to i32
+ %sub20.1 = or i32 %shr.1, %conv19.1
+ %shr29.1 = ashr i32 0, 0
+ %add30.1 = or i32 %shr29.1, %conv15.1
+ %sub39.1 = or i32 %sub.1, %sub20.1
+ %conv40.1 = trunc i32 %sub39.1 to i16
+ store i16 %conv40.1, ptr %arrayidx2.1, align 2
+ %sub44.1 = or i32 %add4.1, %add30.1
+ %conv45.1 = trunc i32 %sub44.1 to i16
+ store i16 %conv45.1, ptr %arrayidx18.1, align 2
+ %conv.213 = zext i16 0 to i32
+ %arrayidx2.2 = getelementptr i8, ptr null, i64 20
+ %conv3.214 = zext i16 0 to i32
+ %add4.2 = or i32 0, %conv.213
+ %sub.2 = or i32 0, %conv3.214
+ %conv15.2 = sext i16 0 to i32
+ %shr.2 = ashr i32 0, 0
+ %arrayidx18.2 = getelementptr i8, ptr null, i64 28
+ %conv19.2 = sext i16 0 to i32
+ %sub20.2 = or i32 %shr.2, %conv19.2
+ %shr29.2 = ashr i32 0, 0
+ %add30.2 = or i32 %shr29.2, %conv15.2
+ %sub39.2 = or i32 %sub.2, %sub20.2
+ %conv40.2 = trunc i32 %sub39.2 to i16
+ store i16 %conv40.2, ptr %arrayidx2.2, align 2
+ %sub44.2 = or i32 %add4.2, %add30.2
+ %conv45.2 = trunc i32 %sub44.2 to i16
+ store i16 %conv45.2, ptr %arrayidx18.2, align 2
+ %conv.315 = zext i16 0 to i32
+ %arrayidx2.3 = getelementptr i8, ptr null, i64 22
+ %conv3.316 = zext i16 0 to i32
+ %add4.3 = or i32 0, %conv.315
+ %sub.3 = or i32 0, %conv3.316
+ %conv15.3 = sext i16 0 to i32
+ %shr.3 = ashr i32 0, 0
+ %arrayidx18.3 = getelementptr i8, ptr null, i64 30
+ %conv19.3 = sext i16 0 to i32
+ %sub20.3 = or i32 %shr.3, %conv19.3
+ %shr29.3 = ashr i32 0, 0
+ %add30.3 = or i32 %shr29.3, %conv15.3
+ %sub39.3 = or i32 %sub.3, %sub20.3
+ %conv40.3 = trunc i32 %sub39.3 to i16
+ store i16 %conv40.3, ptr %arrayidx2.3, align 2
+ %sub44.3 = or i32 %add4.3, %add30.3
+ %conv45.3 = trunc i32 %sub44.3 to i16
+ store i16 %conv45.3, ptr %arrayidx18.3, align 2
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
new file mode 100644
index 0000000..9566c00
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @h() {
+; CHECK-LABEL: define void @h() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
+; CHECK-NEXT: [[TMP0:%.*]] = sub <8 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %conv9 = zext i16 0 to i32
+ %arrayidx2 = getelementptr i8, ptr null, i64 16
+ %conv310 = zext i16 0 to i32
+ %add4 = add i32 %conv310, %conv9
+ %sub = sub i32 0, %conv310
+ %conv15 = sext i16 0 to i32
+ %shr = ashr i32 0, 0
+ %arrayidx18 = getelementptr i8, ptr null, i64 24
+ %conv19 = sext i16 0 to i32
+ %sub20 = sub i32 %shr, %conv19
+ %shr29 = ashr i32 0, 0
+ %add30 = add i32 %shr29, %conv15
+ %sub39 = or i32 %sub, %sub20
+ %conv40 = trunc i32 %sub39 to i16
+ store i16 %conv40, ptr %arrayidx2, align 2
+ %sub44 = or i32 %add4, %add30
+ %conv45 = trunc i32 %sub44 to i16
+ store i16 %conv45, ptr %arrayidx18, align 2
+ %arrayidx2.1 = getelementptr i8, ptr null, i64 18
+ %conv3.112 = zext i16 0 to i32
+ %add4.1 = add i32 %conv3.112, 0
+ %sub.1 = sub i32 0, %conv3.112
+ %conv15.1 = sext i16 0 to i32
+ %shr.1 = ashr i32 0, 0
+ %arrayidx18.1 = getelementptr i8, ptr null, i64 26
+ %conv19.1 = sext i16 0 to i32
+ %sub20.1 = sub i32 %shr.1, %conv19.1
+ %shr29.1 = ashr i32 0, 0
+ %add30.1 = add i32 %shr29.1, %conv15.1
+ %sub39.1 = or i32 %sub.1, %sub20.1
+ %conv40.1 = trunc i32 %sub39.1 to i16
+ store i16 %conv40.1, ptr %arrayidx2.1, align 2
+ %sub44.1 = or i32 %add4.1, %add30.1
+ %conv45.1 = trunc i32 %sub44.1 to i16
+ store i16 %conv45.1, ptr %arrayidx18.1, align 2
+ %conv.213 = zext i16 0 to i32
+ %arrayidx2.2 = getelementptr i8, ptr null, i64 20
+ %conv3.214 = zext i16 0 to i32
+ %add4.2 = add i32 0, %conv.213
+ %sub.2 = sub i32 0, %conv3.214
+ %conv15.2 = sext i16 0 to i32
+ %shr.2 = ashr i32 0, 0
+ %arrayidx18.2 = getelementptr i8, ptr null, i64 28
+ %conv19.2 = sext i16 0 to i32
+ %sub20.2 = sub i32 %shr.2, %conv19.2
+ %shr29.2 = ashr i32 0, 0
+ %add30.2 = add i32 %shr29.2, %conv15.2
+ %sub39.2 = or i32 %sub.2, %sub20.2
+ %conv40.2 = trunc i32 %sub39.2 to i16
+ store i16 %conv40.2, ptr %arrayidx2.2, align 2
+ %sub44.2 = or i32 %add4.2, %add30.2
+ %conv45.2 = trunc i32 %sub44.2 to i16
+ store i16 %conv45.2, ptr %arrayidx18.2, align 2
+ %conv.315 = zext i16 0 to i32
+ %arrayidx2.3 = getelementptr i8, ptr null, i64 22
+ %conv3.316 = zext i16 0 to i32
+ %add4.3 = add i32 0, %conv.315
+ %sub.3 = sub i32 0, %conv3.316
+ %conv15.3 = sext i16 0 to i32
+ %shr.3 = ashr i32 0, 0
+ %arrayidx18.3 = getelementptr i8, ptr null, i64 30
+ %conv19.3 = sext i16 0 to i32
+ %sub20.3 = sub i32 %shr.3, %conv19.3
+ %shr29.3 = ashr i32 0, 0
+ %add30.3 = add i32 %shr29.3, %conv15.3
+ %sub39.3 = or i32 %sub.3, %sub20.3
+ %conv40.3 = trunc i32 %sub39.3 to i16
+ store i16 %conv40.3, ptr %arrayidx2.3, align 2
+ %sub44.3 = or i32 %add4.3, %add30.3
+ %conv45.3 = trunc i32 %sub44.3 to i16
+ store i16 %conv45.3, ptr %arrayidx18.3, align 2
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
new file mode 100644
index 0000000..6404cf4
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @h() {
+; CHECK-LABEL: define void @h() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %arrayidx2 = getelementptr i8, ptr null, i64 16
+ %conv310 = zext i16 0 to i32
+ %add4 = or i32 %conv310, 0
+ %sub = or i32 0, %conv310
+ %conv15 = sext i16 0 to i32
+ %shr = ashr i32 %conv15, 0
+ %arrayidx18 = getelementptr i8, ptr null, i64 24
+ %conv19 = sext i16 0 to i32
+ %sub20 = or i32 %shr, 0
+ %shr29 = ashr i32 %conv19, 0
+ %add30 = or i32 %shr29, %conv15
+ %sub39 = or i32 %sub, %sub20
+ %conv40 = trunc i32 %sub39 to i16
+ store i16 %conv40, ptr %arrayidx2, align 2
+ %sub44 = or i32 %add4, %add30
+ %conv45 = trunc i32 %sub44 to i16
+ store i16 %conv45, ptr %arrayidx18, align 2
+ %arrayidx2.1 = getelementptr i8, ptr null, i64 18
+ %conv3.112 = zext i16 0 to i32
+ %add4.1 = or i32 %conv3.112, 0
+ %sub.1 = or i32 0, %conv3.112
+ %conv15.1 = sext i16 0 to i32
+ %shr.1 = ashr i32 %conv15.1, 0
+ %arrayidx18.1 = getelementptr i8, ptr null, i64 26
+ %conv19.1 = sext i16 0 to i32
+ %sub20.1 = or i32 %shr.1, 0
+ %shr29.1 = ashr i32 %conv19.1, 0
+ %add30.1 = or i32 %shr29.1, 0
+ %sub39.1 = or i32 %sub.1, %sub20.1
+ %conv40.1 = trunc i32 %sub39.1 to i16
+ store i16 %conv40.1, ptr %arrayidx2.1, align 2
+ %sub44.1 = or i32 %add4.1, %add30.1
+ %conv45.1 = trunc i32 %sub44.1 to i16
+ store i16 %conv45.1, ptr %arrayidx18.1, align 2
+ %conv.213 = zext i16 0 to i32
+ %arrayidx2.2 = getelementptr i8, ptr null, i64 20
+ %conv3.214 = zext i16 0 to i32
+ %add4.2 = or i32 0, %conv.213
+ %sub.2 = or i32 0, %conv3.214
+ %conv15.2 = sext i16 0 to i32
+ %shr.2 = ashr i32 %conv15.2, 0
+ %arrayidx18.2 = getelementptr i8, ptr null, i64 28
+ %conv19.2 = sext i16 0 to i32
+ %sub20.2 = or i32 %shr.2, 0
+ %shr29.2 = ashr i32 %conv19.2, 0
+ %add30.2 = or i32 %shr29.2, 0
+ %sub39.2 = or i32 %sub.2, %sub20.2
+ %conv40.2 = trunc i32 %sub39.2 to i16
+ store i16 %conv40.2, ptr %arrayidx2.2, align 2
+ %sub44.2 = or i32 %add4.2, %add30.2
+ %conv45.2 = trunc i32 %sub44.2 to i16
+ store i16 %conv45.2, ptr %arrayidx18.2, align 2
+ %conv.315 = zext i16 0 to i32
+ %arrayidx2.3 = getelementptr i8, ptr null, i64 22
+ %conv3.316 = zext i16 0 to i32
+ %add4.3 = or i32 0, %conv.315
+ %sub.3 = or i32 0, %conv3.316
+ %conv15.3 = sext i16 0 to i32
+ %shr.3 = ashr i32 %conv15.3, 0
+ %arrayidx18.3 = getelementptr i8, ptr null, i64 30
+ %conv19.3 = sext i16 0 to i32
+ %sub20.3 = or i32 %shr.3, 0
+ %shr29.3 = ashr i32 %conv19.3, 0
+ %add30.3 = or i32 %shr29.3, 0
+ %sub39.3 = or i32 %sub.3, %sub20.3
+ %conv40.3 = trunc i32 %sub39.3 to i16
+ store i16 %conv40.3, ptr %arrayidx2.3, align 2
+ %sub44.3 = or i32 %add4.3, %add30.3
+ %conv45.3 = trunc i32 %sub44.3 to i16
+ store i16 %conv45.3, ptr %arrayidx18.3, align 2
+ ret void
+}
diff --git a/llvm/test/tools/llvm-ar/coff-symtab.test b/llvm/test/tools/llvm-ar/coff-symtab.test
new file mode 100644
index 0000000..4a57472
--- /dev/null
+++ b/llvm/test/tools/llvm-ar/coff-symtab.test
@@ -0,0 +1,91 @@
+Verify that llvm-ar uses the COFF archive format by ensuring that the archive map is sorted.
+
+RUN: rm -rf %t.dir && split-file %s %t.dir && cd %t.dir
+
+RUN: yaml2obj coff-symtab.yaml -o coff-symtab.obj
+RUN: llvm-ar crs out.a coff-symtab.obj
+RUN: llvm-nm --print-armap out.a | FileCheck %s
+
+RUN: llvm-as coff-symtab.ll -o coff-symtab.bc
+RUN: llvm-ar crs out2.a coff-symtab.bc
+RUN: llvm-nm --print-armap out2.a | FileCheck %s
+
+RUN: yaml2obj elf.yaml -o coff-symtab.o
+RUN: llvm-ar crs --format coff out3.a coff-symtab.o
+RUN: llvm-nm --print-armap out3.a | FileCheck %s
+
+Create an empty archive with no symbol map, add a COFF file to it and check that the output archive is a COFF archive.
+
+RUN: llvm-ar --format coff rcS out4.a
+RUN: llvm-ar rs out4.a coff-symtab.obj
+RUN: llvm-nm --print-armap out4.a | FileCheck %s
+
+CHECK: Archive map
+CHECK-NEXT: a in coff-symtab
+CHECK-NEXT: b in coff-symtab
+CHECK-NEXT: c in coff-symtab
+CHECK-EMPTY:
+
+#--- coff-symtab.yaml
+--- !COFF
+header:
+ Machine: IMAGE_FILE_MACHINE_UNKNOWN
+ Characteristics: [ ]
+sections:
+ - Name: .text
+ Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+ Alignment: 4
+ SectionData: ''
+symbols:
+ - Name: b
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_FUNCTION
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+ - Name: c
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_FUNCTION
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+ - Name: a
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_FUNCTION
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+...
+
+
+#--- coff-symtab.ll
+target triple = "x86_64-unknown-windows-msvc"
+
+define void @b() { ret void }
+define void @c() { ret void }
+define void @a() { ret void }
+
+#--- elf.yaml
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ AddressAlign: 0x0000000000000004
+ Content: ''
+Symbols:
+ - Name: b
+ Binding: STB_GLOBAL
+ Section: .text
+ - Name: c
+ Binding: STB_GLOBAL
+ Section: .text
+ - Name: a
+ Binding: STB_GLOBAL
+ Section: .text
+...
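The CHECK lines above rely on a property of the COFF archive format: the archive map is sorted by symbol name, so members defined in the order b, c, a come back as a, b, c. A minimal C++ sketch of that property, under the assumption that sorting is the only transformation involved (buildCoffArchiveMap is a hypothetical name, not an llvm-ar API):

#include <algorithm>
#include <string>
#include <vector>

// Hypothetical illustration: a COFF archive map is emitted sorted by
// symbol name, which is exactly what the CHECK-NEXT lines above assert.
std::vector<std::string> buildCoffArchiveMap(std::vector<std::string> Syms) {
  std::sort(Syms.begin(), Syms.end()); // {"b", "c", "a"} -> {"a", "b", "c"}
  return Syms;
}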
diff --git a/llvm/test/tools/llvm-ar/no-symtab.yaml b/llvm/test/tools/llvm-ar/no-symtab.yaml
new file mode 100644
index 0000000..7370c9b
--- /dev/null
+++ b/llvm/test/tools/llvm-ar/no-symtab.yaml
@@ -0,0 +1,32 @@
+## Create archives with no symtab in various formats and check that we can read them.
+
+# RUN: yaml2obj %s -o %t.o
+# RUN: rm -f %t.*.a
+
+# RUN: llvm-ar --format=gnu rcS %t.gnu.a %t.o
+# RUN: llvm-ar --format=coff rcS %t.coff.a %t.o
+# RUN: llvm-ar --format=darwin rcS %t.darwin.a %t.o
+# RUN: llvm-ar --format=bsd rcS %t.bsd.a %t.o
+# RUN: llvm-ar --format=bigarchive rcS %t.bigarchive.a %t.o
+
+# RUN: llvm-nm --print-armap %t.gnu.a | FileCheck %s
+# RUN: llvm-nm --print-armap %t.coff.a | FileCheck %s
+# RUN: llvm-nm --print-armap %t.darwin.a | FileCheck %s
+# RUN: llvm-nm --print-armap %t.bsd.a | FileCheck %s
+# RUN: llvm-nm --print-armap %t.bigarchive.a | FileCheck %s
+
+# CHECK-NOT: Archive map
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+Symbols:
+ - Name: symbol
+ Binding: STB_GLOBAL
+ Section: .text
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vector-integer-arithmetic.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vector-integer-arithmetic.s
index 21459bc..3b6fd7e 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vector-integer-arithmetic.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vector-integer-arithmetic.s
@@ -399,6 +399,26 @@ vmseq.vv v4, v8, v12
vsetvli zero, zero, e64, m8, tu, mu
vmseq.vx v4, v8, x10
+# Pseudo instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vmslt.vi v4, v8, 1
+vsetvli zero, zero, e8, mf4, tu, mu
+vmsltu.vi v4, v8, 1
+vsetvli zero, zero, e8, mf2, tu, mu
+vmsltu.vi v4, v8, 0
+vsetvli zero, zero, e8, m1, tu, mu
+vmsgeu.vi v4, v8, 1
+vsetvli zero, zero, e8, m2, tu, mu
+vmsge.vi v4, v8, 1
+vsetvli zero, zero, e8, m4, tu, mu
+vmsgeu.vi v4, v8, 0
+vsetvli zero, zero, e16, mf4, tu, mu
+vmsge.vi v4, v8, 0
+vsetvli zero, zero, e16, mf2, tu, mu
+vmsge.vx v4, v8, x10
+vsetvli zero, zero, e16, m1, tu, mu
+vmsgeu.vx v4, v8, x11
+
# Vector Integer Min/Max Instructions
vsetvli zero, zero, e8, mf8, tu, mu
vminu.vv v4, v8, v12
@@ -754,14 +774,14 @@ vsetvli zero, zero, e64, m8, tu, mu
vmv.v.v v4, v12
# CHECK: Iterations: 1
-# CHECK-NEXT: Instructions: 707
-# CHECK-NEXT: Total Cycles: 11962
-# CHECK-NEXT: Total uOps: 707
+# CHECK-NEXT: Instructions: 727
+# CHECK-NEXT: Total Cycles: 12018
+# CHECK-NEXT: Total uOps: 727
# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 0.06
# CHECK-NEXT: IPC: 0.06
-# CHECK-NEXT: Block RThroughput: 11549.0
+# CHECK-NEXT: Block RThroughput: 11583.0
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
@@ -1144,6 +1164,26 @@ vmv.v.v v4, v12
# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e64, m8, tu, mu
# CHECK-NEXT: 1 19 17.00 vmseq.vx v4, v8, a0
# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT: 1 4 2.00 vmsle.vi v4, v8, 0
+# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT: 1 4 2.00 vmsleu.vi v4, v8, 0
+# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT: 1 4 2.00 vmsne.vv v4, v8, v8
+# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, m1, tu, mu
+# CHECK-NEXT: 1 5 3.00 vmsgtu.vi v4, v8, 0
+# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, m2, tu, mu
+# CHECK-NEXT: 1 7 5.00 vmsgt.vi v4, v8, 0
+# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, m4, tu, mu
+# CHECK-NEXT: 1 11 9.00 vmseq.vv v4, v8, v8
+# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT: 1 4 2.00 vmsgt.vi v4, v8, -1
+# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT: 1 4 2.00 vmslt.vx v4, v8, a0
+# CHECK-NEXT: 1 4 2.00 vmnot.m v4, v4
+# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e16, m1, tu, mu
+# CHECK-NEXT: 1 5 3.00 vmsltu.vx v4, v8, a1
+# CHECK-NEXT: 1 4 2.00 vmnot.m v4, v4
+# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, mf8, tu, mu
# CHECK-NEXT: 1 4 2.00 vminu.vv v4, v8, v12
# CHECK-NEXT: 1 3 1.00 U vsetvli zero, zero, e8, mf4, tu, mu
# CHECK-NEXT: 1 4 2.00 vminu.vx v4, v8, a0
@@ -1492,7 +1532,7 @@ vmv.v.v v4, v12
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7]
-# CHECK-NEXT: - - 333.00 - 11549.00 374.00 - -
+# CHECK-NEXT: - - 342.00 - 11583.00 385.00 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions:
@@ -1868,6 +1908,26 @@ vmv.v.v v4, v12
# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e64, m8, tu, mu
# CHECK-NEXT: - - - - 17.00 1.00 - - vmseq.vx v4, v8, a0
# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT: - - - - 2.00 1.00 - - vmsle.vi v4, v8, 0
+# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT: - - - - 2.00 1.00 - - vmsleu.vi v4, v8, 0
+# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT: - - - - 2.00 1.00 - - vmsne.vv v4, v8, v8
+# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, m1, tu, mu
+# CHECK-NEXT: - - - - 3.00 1.00 - - vmsgtu.vi v4, v8, 0
+# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, m2, tu, mu
+# CHECK-NEXT: - - - - 5.00 1.00 - - vmsgt.vi v4, v8, 0
+# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, m4, tu, mu
+# CHECK-NEXT: - - - - 9.00 1.00 - - vmseq.vv v4, v8, v8
+# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT: - - - - 2.00 1.00 - - vmsgt.vi v4, v8, -1
+# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT: - - - - 2.00 1.00 - - vmslt.vx v4, v8, a0
+# CHECK-NEXT: - - - - 2.00 1.00 - - vmnot.m v4, v4
+# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e16, m1, tu, mu
+# CHECK-NEXT: - - - - 3.00 1.00 - - vmsltu.vx v4, v8, a1
+# CHECK-NEXT: - - - - 2.00 1.00 - - vmnot.m v4, v4
+# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - 2.00 1.00 - - vminu.vv v4, v8, v12
# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - 2.00 1.00 - - vminu.vx v4, v8, a0
diff --git a/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zstd.test b/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zstd.test
index dcf9c37..bde1c2f 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zstd.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zstd.test
@@ -15,7 +15,7 @@
# COMPRESSED: .debug_alloc PROGBITS 0000000000000000 {{.*}} 000040 00 A 0 0 0
# DECOMPRESSED: .debug_foo PROGBITS 0000000000000000 000040 000008 00 0 0 0
# DECOMPRESSED-NEXT: .notdebug_foo PROGBITS 0000000000000000 {{.*}} 000008 00 0 0 0
-# UNCOMPRESSED: .debug_alloc PROGBITS 0000000000000000 {{.*}} 000040 00 A 0 0 0
+# DECOMPRESSED: .debug_alloc PROGBITS 0000000000000000 {{.*}} 000040 00 A 0 0 0
## Relocations do not change.
# CHECK: Relocation section '.rela.debug_foo' at offset {{.*}} contains 2 entries:
diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp
index 81cb2a2..294b853 100644
--- a/llvm/tools/llvm-ar/llvm-ar.cpp
+++ b/llvm/tools/llvm-ar/llvm-ar.cpp
@@ -82,6 +82,7 @@ static void printArHelp(StringRef ToolName) {
=darwin - darwin
=bsd - bsd
=bigarchive - big archive (AIX OS)
+ =coff - coff
--plugin=<string> - ignored for compatibility
-h --help - display this help and exit
--output - the directory to extract archive members to
@@ -193,7 +194,7 @@ static SmallVector<const char *, 256> PositionalArgs;
static bool MRI;
namespace {
-enum Format { Default, GNU, BSD, DARWIN, BIGARCHIVE, Unknown };
+enum Format { Default, GNU, COFF, BSD, DARWIN, BIGARCHIVE, Unknown };
}
static Format FormatType = Default;
@@ -1025,14 +1026,21 @@ static void performWriteOperation(ArchiveOperation Operation,
Kind = object::Archive::K_GNU;
else if (OldArchive) {
Kind = OldArchive->kind();
- if (Kind == object::Archive::K_BSD) {
- auto InferredKind = object::Archive::K_BSD;
+ std::optional<object::Archive::Kind> AltKind;
+ if (Kind == object::Archive::K_BSD)
+ AltKind = object::Archive::K_DARWIN;
+ else if (Kind == object::Archive::K_GNU && !OldArchive->hasSymbolTable())
+ // If there is no symbol table, we can't tell the GNU format from the
+ // COFF format based on the old archive's kind alone.
+ AltKind = object::Archive::K_COFF;
+ if (AltKind) {
+ auto InferredKind = Kind;
if (NewMembersP && !NewMembersP->empty())
InferredKind = NewMembersP->front().detectKindFromObject();
else if (!NewMembers.empty())
InferredKind = NewMembers.front().detectKindFromObject();
- if (InferredKind == object::Archive::K_DARWIN)
- Kind = object::Archive::K_DARWIN;
+ if (InferredKind == AltKind)
+ Kind = *AltKind;
}
} else if (NewMembersP)
Kind = !NewMembersP->empty() ? NewMembersP->front().detectKindFromObject()
@@ -1044,6 +1052,9 @@ static void performWriteOperation(ArchiveOperation Operation,
case GNU:
Kind = object::Archive::K_GNU;
break;
+ case COFF:
+ Kind = object::Archive::K_COFF;
+ break;
case BSD:
if (Thin)
fail("only the gnu format has a thin mode");
@@ -1376,6 +1387,7 @@ static int ar_main(int argc, char **argv) {
.Case("darwin", DARWIN)
.Case("bsd", BSD)
.Case("bigarchive", BIGARCHIVE)
+ .Case("coff", COFF)
.Default(Unknown);
if (FormatType == Unknown)
fail(std::string("Invalid format ") + Match);
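The inference in performWriteOperation above generalizes the old BSD/Darwin special case: the old archive's kind is kept unless it is ambiguous (BSD vs. Darwin, or GNU vs. COFF when there is no symbol table) and the first new member identifies the alternative. A minimal sketch of that decision, using hypothetical stand-in types rather than the real llvm-ar state:

#include <optional>
#include <vector>

enum class Kind { GNU, COFF, BSD, Darwin };

// Hypothetical stand-in for an archive member and its detected format.
struct Member { Kind DetectedKind; };

// Paraphrase of the logic above: keep the old archive's kind unless it is
// ambiguous and the first new member clearly identifies the alternative.
Kind pickKind(Kind OldKind, bool OldHasSymtab,
              const std::vector<Member> &NewMembers) {
  std::optional<Kind> AltKind;
  if (OldKind == Kind::BSD)
    AltKind = Kind::Darwin;
  else if (OldKind == Kind::GNU && !OldHasSymtab)
    AltKind = Kind::COFF;
  if (AltKind && !NewMembers.empty() &&
      NewMembers.front().DetectedKind == *AltKind)
    return *AltKind;
  return OldKind;
}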
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 17ce035..5c9848f 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -301,7 +301,6 @@ private:
if (AddMemDefError)
return AddMemDefError;
- long ParentTID = get_threadid();
pid_t ParentOrChildPID = fork();
if (ParentOrChildPID == -1) {
@@ -315,7 +314,7 @@ private:
// Unregister handlers, signal handling is now handled through ptrace in
// the host process.
sys::unregisterHandlers();
- prepareAndRunBenchmark(PipeFiles[0], Key, ParentTID);
+ prepareAndRunBenchmark(PipeFiles[0], Key);
// The child process terminates in the above function, so we should never
// get to this point.
llvm_unreachable("Child process didn't exit when expected.");
@@ -416,8 +415,8 @@ private:
setrlimit(RLIMIT_CORE, &rlim);
}
- [[noreturn]] void prepareAndRunBenchmark(int Pipe, const BenchmarkKey &Key,
- long ParentTID) const {
+ [[noreturn]] void prepareAndRunBenchmark(int Pipe,
+ const BenchmarkKey &Key) const {
// Disable core dumps in the child process as otherwise every time we
// encounter an execution failure like a segmentation fault, we will create
// a core dump. We report the information directly rather than require the
@@ -474,7 +473,7 @@ private:
Expected<int> AuxMemFDOrError =
SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
- Key.MemoryValues, ParentPID, ParentTID, CounterFileDescriptor);
+ Key.MemoryValues, ParentPID, CounterFileDescriptor);
if (!AuxMemFDOrError)
exit(ChildProcessExitCodeE::AuxiliaryMemorySetupFailed);
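The surrounding comments spell out the control flow this hunk simplifies: the parent forks, the child unregisters signal handlers and runs the benchmark through a [[noreturn]] function, and after the ParentTID removal the child needs only the pipe and the benchmark key. A minimal sketch of that shape, with error handling elided and hypothetical helper names:

#include <sys/types.h>
#include <unistd.h>

[[noreturn]] static void runChildAndExit(int Pipe) {
  // ... run the benchmark and report through Pipe, then terminate;
  // control never returns to the caller.
  _exit(0);
}

static void runInSubprocess(int Pipe) {
  pid_t PID = fork();
  if (PID == 0)
    runChildAndExit(Pipe); // child: never falls through
  // parent: PID > 0 (or -1 on failure); monitor the child via ptrace here.
}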
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
index 28b341c..a49fa077 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
@@ -9,14 +9,11 @@
#include "SubprocessMemory.h"
#include "Error.h"
#include "llvm/Support/Error.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/Threading.h"
#include <cerrno>
#ifdef __linux__
#include <fcntl.h>
#include <sys/mman.h>
-#include <sys/syscall.h>
#include <unistd.h>
#endif
@@ -29,10 +26,8 @@ Error SubprocessMemory::initializeSubprocessMemory(pid_t ProcessID) {
// Add the PID to the shared memory name so that if we're running multiple
// processes at the same time, they won't interfere with each other.
// This comes up particularly often when running the exegesis tests with
- // llvm-lit. Additionally add the TID so that downstream consumers
- // using multiple threads don't run into conflicts.
- std::string AuxiliaryMemoryName =
- formatv("/{0}auxmem{1}", get_threadid(), ProcessID);
+ // llvm-lit.
+ std::string AuxiliaryMemoryName = "/auxmem" + std::to_string(ProcessID);
int AuxiliaryMemoryFD = shm_open(AuxiliaryMemoryName.c_str(),
O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
if (AuxiliaryMemoryFD == -1)
@@ -52,8 +47,8 @@ Error SubprocessMemory::addMemoryDefinition(
pid_t ProcessPID) {
SharedMemoryNames.reserve(MemoryDefinitions.size());
for (auto &[Name, MemVal] : MemoryDefinitions) {
- std::string SharedMemoryName =
- formatv("/{0}t{1}memdef{2}", ProcessPID, get_threadid(), MemVal.Index);
+ std::string SharedMemoryName = "/" + std::to_string(ProcessPID) + "memdef" +
+ std::to_string(MemVal.Index);
SharedMemoryNames.push_back(SharedMemoryName);
int SharedMemoryFD =
shm_open(SharedMemoryName.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
@@ -87,9 +82,8 @@ Error SubprocessMemory::addMemoryDefinition(
Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
- pid_t ParentPID, uint64_t ParentTID, int CounterFileDescriptor) {
- std::string AuxiliaryMemoryName =
- formatv("/{0}auxmem{1}", ParentTID, ParentPID);
+ pid_t ParentPID, int CounterFileDescriptor) {
+ std::string AuxiliaryMemoryName = "/auxmem" + std::to_string(ParentPID);
int AuxiliaryMemoryFileDescriptor =
shm_open(AuxiliaryMemoryName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
if (AuxiliaryMemoryFileDescriptor == -1)
@@ -103,8 +97,8 @@ Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
return make_error<Failure>("Mapping auxiliary memory failed");
AuxiliaryMemoryMapping[0] = CounterFileDescriptor;
for (auto &[Name, MemVal] : MemoryDefinitions) {
- std::string MemoryValueName =
- formatv("/{0}t{1}memdef{2}", ParentPID, ParentTID, MemVal.Index);
+ std::string MemoryValueName = "/" + std::to_string(ParentPID) + "memdef" +
+ std::to_string(MemVal.Index);
AuxiliaryMemoryMapping[AuxiliaryMemoryOffset + MemVal.Index] =
shm_open(MemoryValueName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
if (AuxiliaryMemoryMapping[AuxiliaryMemoryOffset + MemVal.Index] == -1)
@@ -139,7 +133,7 @@ Error SubprocessMemory::addMemoryDefinition(
Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
- pid_t ParentPID, uint64_t ParentTID, int CounterFileDescriptor) {
+ pid_t ParentPID, int CounterFileDescriptor) {
return make_error<Failure>(
"setupAuxiliaryMemoryInSubprocess is only supported on Linux");
}
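After this change the shared-memory objects are keyed by PID alone; the TID component that was added for multi-threaded consumers is reverted along with the FormatVariadic and Threading dependencies. A minimal sketch of the resulting naming scheme, assuming POSIX shm_open semantics (openAuxMemForPid is a hypothetical helper, not part of SubprocessMemory):

#include <fcntl.h>
#include <string>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>

// One auxiliary-memory object per process, named "/auxmem<pid>"; two
// concurrent llvm-exegesis processes therefore cannot collide, but two
// threads in one process would share the same name.
int openAuxMemForPid(pid_t PID) {
  std::string Name = "/auxmem" + std::to_string(PID);
  return shm_open(Name.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
}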
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
index 807046e..e20b50c 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
@@ -54,7 +54,7 @@ public:
// section.
static Expected<int> setupAuxiliaryMemoryInSubprocess(
std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
- pid_t ParentPID, uint64_t ParentTID, int CounterFileDescriptor);
+ pid_t ParentPID, int CounterFileDescriptor);
~SubprocessMemory();
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 17fc3ce..2b764c9 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -130,6 +130,9 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
SDValue Add = DAG->getNode(ISD::ADD, DL, Int32VT, Op0, Op1);
SDValue Sub = DAG->getNode(ISD::SUB, DL, Int32VT, Add, Op0);
SDValue Mul = DAG->getNode(ISD::MUL, DL, Int32VT, Add, Sub);
+ SDValue And = DAG->getNode(ISD::AND, DL, Int32VT, Op0, Op1);
+ SDValue Xor = DAG->getNode(ISD::XOR, DL, Int32VT, Op1, Op0);
+ SDValue Or = DAG->getNode(ISD::OR, DL, Int32VT, Op0, Op1);
SDValue SFAdd = DAG->getNode(ISD::STRICT_FADD, DL, {Float32VT, MVT::Other},
{DAG->getEntryNode(), Op2, Op2});
@@ -144,6 +147,14 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
EXPECT_TRUE(
sd_match(SFAdd, m_ChainedBinOp(ISD::STRICT_FADD, m_SpecificVT(Float32VT),
m_SpecificVT(Float32VT))));
+
+ EXPECT_TRUE(sd_match(And, m_c_BinOp(ISD::AND, m_Value(), m_Value())));
+ EXPECT_TRUE(sd_match(And, m_And(m_Value(), m_Value())));
+ EXPECT_TRUE(sd_match(Xor, m_c_BinOp(ISD::XOR, m_Value(), m_Value())));
+ EXPECT_TRUE(sd_match(Xor, m_Xor(m_Value(), m_Value())));
+ EXPECT_TRUE(sd_match(Or, m_c_BinOp(ISD::OR, m_Value(), m_Value())));
+ EXPECT_TRUE(sd_match(Or, m_Or(m_Value(), m_Value())));
+
SDValue BindVal;
EXPECT_TRUE(sd_match(SFAdd, m_ChainedBinOp(ISD::STRICT_FADD, m_Value(BindVal),
m_Deferred(BindVal))));
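The new assertions exercise the commutative binary-op matchers (m_And, m_Or, m_Xor, m_c_BinOp) from SDPatternMatch.h. A minimal sketch of how such a matcher might be used in target code, assuming only the names exercised above:

#include "llvm/CodeGen/SDPatternMatch.h"

using namespace llvm;
using namespace llvm::SDPatternMatch;

// m_And is commutative, so this matches both (and X, Y) and (and Y, X);
// X and Y are bound to the operands on success.
static bool matchAnyAnd(SDValue N, SDValue &X, SDValue &Y) {
  return sd_match(N, m_And(m_Value(X), m_Value(Y)));
}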
diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
index e23c7ea..bfc64cb 100644
--- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
+++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
@@ -149,7 +149,7 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
Instruction *Instr2 = Instr1->getNextNode();
DPMarker *Marker1 = Instr1->DbgMarker;
DPMarker *Marker2 = Instr2->DbgMarker;
- // There's no TrailingDPValues marker allocated yet.
+ // There's no TrailingDbgRecords marker allocated yet.
DPMarker *EndMarker = nullptr;
// Check that the "getMarker" utilities operate as expected.
@@ -159,26 +159,26 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
EXPECT_EQ(BB.getNextMarker(Instr2), EndMarker); // Is nullptr.
// There should be two DPValues,
- EXPECT_EQ(Marker1->StoredDPValues.size(), 1u);
- EXPECT_EQ(Marker2->StoredDPValues.size(), 1u);
+ EXPECT_EQ(Marker1->StoredDbgRecords.size(), 1u);
+ EXPECT_EQ(Marker2->StoredDbgRecords.size(), 1u);
// Unlink them and try to re-insert them through the basic block.
- DbgRecord *DPV1 = &*Marker1->StoredDPValues.begin();
- DbgRecord *DPV2 = &*Marker2->StoredDPValues.begin();
+ DbgRecord *DPV1 = &*Marker1->StoredDbgRecords.begin();
+ DbgRecord *DPV2 = &*Marker2->StoredDbgRecords.begin();
DPV1->removeFromParent();
DPV2->removeFromParent();
- EXPECT_TRUE(Marker1->StoredDPValues.empty());
- EXPECT_TRUE(Marker2->StoredDPValues.empty());
+ EXPECT_TRUE(Marker1->StoredDbgRecords.empty());
+ EXPECT_TRUE(Marker2->StoredDbgRecords.empty());
// This should appear in Marker1.
BB.insertDbgRecordBefore(DPV1, BB.begin());
- EXPECT_EQ(Marker1->StoredDPValues.size(), 1u);
- EXPECT_EQ(DPV1, &*Marker1->StoredDPValues.begin());
+ EXPECT_EQ(Marker1->StoredDbgRecords.size(), 1u);
+ EXPECT_EQ(DPV1, &*Marker1->StoredDbgRecords.begin());
// This should attach to Marker2.
BB.insertDbgRecordAfter(DPV2, &*BB.begin());
- EXPECT_EQ(Marker2->StoredDPValues.size(), 1u);
- EXPECT_EQ(DPV2, &*Marker2->StoredDPValues.begin());
+ EXPECT_EQ(Marker2->StoredDbgRecords.size(), 1u);
+ EXPECT_EQ(DPV2, &*Marker2->StoredDbgRecords.begin());
// Now, how about removing instructions? That should cause any DPValues to
// "fall down".
@@ -186,7 +186,7 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
Marker1 = nullptr;
// DPValues should now be in Marker2.
EXPECT_EQ(BB.size(), 1u);
- EXPECT_EQ(Marker2->StoredDPValues.size(), 2u);
+ EXPECT_EQ(Marker2->StoredDbgRecords.size(), 2u);
// They should also be in the correct order.
SmallVector<DbgRecord *, 2> DPVs;
for (DbgRecord &DPV : Marker2->getDbgRecordRange())
@@ -201,7 +201,7 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
EXPECT_TRUE(BB.empty());
EndMarker = BB.getTrailingDbgRecords();
ASSERT_NE(EndMarker, nullptr);
- EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
+ EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
// Again, these should arrive in the correct order.
DPVs.clear();
@@ -213,13 +213,13 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
// Inserting a normal instruction at the beginning: shouldn't dislodge the
// DPValues. It's intended to not go at the start.
Instr1->insertBefore(BB, BB.begin());
- EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
+ EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
Instr1->removeFromParent();
// Inserting at end(): should dislodge the DPValues; if they were dbg.values
// then they would sit "above" the new instruction.
Instr1->insertBefore(BB, BB.end());
- EXPECT_EQ(Instr1->DbgMarker->StoredDPValues.size(), 2u);
+ EXPECT_EQ(Instr1->DbgMarker->StoredDbgRecords.size(), 2u);
// We should de-allocate the trailing marker when something is inserted
// at end().
EXPECT_EQ(BB.getTrailingDbgRecords(), nullptr);
@@ -227,14 +227,14 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
// Remove Instr1: now the DPValues will fall down again,
Instr1->removeFromParent();
EndMarker = BB.getTrailingDbgRecords();
- EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
+ EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
// Inserting a terminator, however it's intended, should dislodge the
// trailing DPValues, as it's the clear intention of the caller that this be
// the final instr in the block, and DPValues aren't allowed to live off the
// end forever.
Instr2->insertBefore(BB, BB.begin());
- EXPECT_EQ(Instr2->DbgMarker->StoredDPValues.size(), 2u);
+ EXPECT_EQ(Instr2->DbgMarker->StoredDbgRecords.size(), 2u);
EXPECT_EQ(BB.getTrailingDbgRecords(), nullptr);
// Teardown,
@@ -298,24 +298,24 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
Instruction *DInst = CInst->getNextNode();
// CInst should have debug-info.
ASSERT_TRUE(CInst->DbgMarker);
- EXPECT_FALSE(CInst->DbgMarker->StoredDPValues.empty());
+ EXPECT_FALSE(CInst->DbgMarker->StoredDbgRecords.empty());
// If we move "c" to the start of the block, just normally, then the DPValues
// should fall down to "d".
CInst->moveBefore(BB, BeginIt2);
- EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDPValues.empty());
+ EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDbgRecords.empty());
ASSERT_TRUE(DInst->DbgMarker);
- EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+ EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
// Whereas if we move D to the start of the block with moveBeforePreserving,
// the DPValues should move with it.
DInst->moveBeforePreserving(BB, BB.begin());
- EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+ EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
EXPECT_EQ(&*BB.begin(), DInst);
// Similarly, moveAfterPreserving "D" to "C" should move DPValues with "D".
DInst->moveAfterPreserving(CInst);
- EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+ EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
// (move back to the start...)
DInst->moveBeforePreserving(BB, BB.begin());
@@ -324,8 +324,8 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
// If we move "C" to the beginning of the block, it should go before the
// DPValues. They'll stay on "D".
CInst->moveBefore(BB, BB.begin());
- EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDPValues.empty());
- EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+ EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDbgRecords.empty());
+ EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
EXPECT_EQ(&*BB.begin(), CInst);
EXPECT_EQ(CInst->getNextNode(), DInst);
@@ -341,8 +341,8 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
// run of dbg.values and the next instruction.
CInst->moveBefore(BB, DInst->getIterator());
// CInst gains the DPValues.
- EXPECT_TRUE(!DInst->DbgMarker || DInst->DbgMarker->StoredDPValues.empty());
- EXPECT_FALSE(CInst->DbgMarker->StoredDPValues.empty());
+ EXPECT_TRUE(!DInst->DbgMarker || DInst->DbgMarker->StoredDbgRecords.empty());
+ EXPECT_FALSE(CInst->DbgMarker->StoredDbgRecords.empty());
EXPECT_EQ(&*BB.begin(), CInst);
UseNewDbgInfoFormat = false;
@@ -390,16 +390,16 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
ASSERT_FALSE(BInst->DbgMarker);
ASSERT_TRUE(CInst->DbgMarker);
- ASSERT_EQ(CInst->DbgMarker->StoredDPValues.size(), 1u);
- DbgRecord *DPV1 = &*CInst->DbgMarker->StoredDPValues.begin();
+ ASSERT_EQ(CInst->DbgMarker->StoredDbgRecords.size(), 1u);
+ DbgRecord *DPV1 = &*CInst->DbgMarker->StoredDbgRecords.begin();
ASSERT_TRUE(DPV1);
EXPECT_FALSE(BInst->hasDbgRecords());
// Clone DPValues from one inst to another. Other arguments to clone are
// tested in DPMarker test.
auto Range1 = BInst->cloneDebugInfoFrom(CInst);
- EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 1u);
- DbgRecord *DPV2 = &*BInst->DbgMarker->StoredDPValues.begin();
+ EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
+ DbgRecord *DPV2 = &*BInst->DbgMarker->StoredDbgRecords.begin();
EXPECT_EQ(std::distance(Range1.begin(), Range1.end()), 1u);
EXPECT_EQ(&*Range1.begin(), DPV2);
EXPECT_NE(DPV1, DPV2);
@@ -417,12 +417,12 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
// Dropping should be easy,
BInst->dropDbgRecords();
EXPECT_FALSE(BInst->hasDbgRecords());
- EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 0u);
+ EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 0u);
// And we should be able to drop individual DPValues.
CInst->dropOneDbgRecord(DPV1);
EXPECT_FALSE(CInst->hasDbgRecords());
- EXPECT_EQ(CInst->DbgMarker->StoredDPValues.size(), 0u);
+ EXPECT_EQ(CInst->DbgMarker->StoredDbgRecords.size(), 0u);
UseNewDbgInfoFormat = false;
}
@@ -531,9 +531,9 @@ protected:
Branch = &*Last;
CInst = &*Dest;
- DPVA = cast<DPValue>(&*BInst->DbgMarker->StoredDPValues.begin());
- DPVB = cast<DPValue>(&*Branch->DbgMarker->StoredDPValues.begin());
- DPVConst = cast<DPValue>(&*CInst->DbgMarker->StoredDPValues.begin());
+ DPVA = cast<DPValue>(&*BInst->DbgMarker->StoredDbgRecords.begin());
+ DPVB = cast<DPValue>(&*Branch->DbgMarker->StoredDbgRecords.begin());
+ DPVConst = cast<DPValue>(&*CInst->DbgMarker->StoredDbgRecords.begin());
}
void TearDown() override { UseNewDbgInfoFormat = false; }
@@ -1171,7 +1171,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
// spliced in.
Instruction *BInst = &*Entry.begin();
ASSERT_TRUE(BInst->DbgMarker);
- EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 1u);
+ EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
UseNewDbgInfoFormat = false;
}
@@ -1387,7 +1387,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
// should be in the correct order of %a, then 0.
Instruction *BInst = &*Entry.begin();
ASSERT_TRUE(BInst->hasDbgRecords());
- EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 2u);
+ EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 2u);
SmallVector<DPValue *, 2> DPValues;
for (DbgRecord &DPV : BInst->getDbgRecordRange())
DPValues.push_back(cast<DPValue>(&DPV));
@@ -1457,7 +1457,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
// We should now have one dbg.values on the first instruction, %a.
Instruction *BInst = &*Entry.begin();
ASSERT_TRUE(BInst->hasDbgRecords());
- EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 1u);
+ EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
SmallVector<DPValue *, 2> DPValues;
for (DbgRecord &DPV : BInst->getDbgRecordRange())
DPValues.push_back(cast<DPValue>(&DPV));
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index 0b019c2..4bd11d2 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -951,7 +951,7 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
ExitBlock->createMarker(FirstInst);
ExitBlock->createMarker(RetInst);
- // Insert DPValues into markers, order should come out DPV2, DPV1.
+ // Insert DbgRecords into markers, order should come out DPV2, DPV1.
FirstInst->DbgMarker->insertDbgRecord(DPV1, false);
FirstInst->DbgMarker->insertDbgRecord(DPV2, true);
unsigned int ItCount = 0;
@@ -964,7 +964,7 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
// Clone them onto the second marker -- should allocate new DPVs.
RetInst->DbgMarker->cloneDebugInfoFrom(FirstInst->DbgMarker, std::nullopt, false);
- EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 2u);
+ EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 2u);
ItCount = 0;
// Check these things store the same information; but that they're not the same
// objects.
@@ -980,25 +980,25 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
}
RetInst->DbgMarker->dropDbgRecords();
- EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 0u);
+ EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 0u);
// Try cloning one single DPValue.
auto DIIt = std::next(FirstInst->DbgMarker->getDbgRecordRange().begin());
RetInst->DbgMarker->cloneDebugInfoFrom(FirstInst->DbgMarker, DIIt, false);
- EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 1u);
+ EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 1u);
// The second DPValue should have been cloned; it should have the same values
// as DPV1.
- EXPECT_EQ(cast<DPValue>(RetInst->DbgMarker->StoredDPValues.begin())
+ EXPECT_EQ(cast<DPValue>(RetInst->DbgMarker->StoredDbgRecords.begin())
->getRawLocation(),
DPV1->getRawLocation());
- // We should be able to drop individual DPValues.
+ // We should be able to drop individual DbgRecords.
RetInst->DbgMarker->dropOneDbgRecord(
- &*RetInst->DbgMarker->StoredDPValues.begin());
+ &*RetInst->DbgMarker->StoredDbgRecords.begin());
// "Aborb" a DPMarker: this means pretend that the instruction it's attached
// to is disappearing so it needs to be transferred into "this" marker.
RetInst->DbgMarker->absorbDebugValues(*FirstInst->DbgMarker, true);
- EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 2u);
+ EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 2u);
// Should be the DPV1 and DPV2 objects.
ItCount = 0;
for (DbgRecord &Item : RetInst->DbgMarker->getDbgRecordRange()) {
@@ -1009,7 +1009,7 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
}
// Finally -- there are two DPValues left over. If we remove everything in the
- // basic block, then they should sink down into the "TrailingDPValues"
+ // basic block, then they should sink down into the "TrailingDbgRecords"
// container for dangling debug-info. Future facilities will restore them
// back when a terminator is inserted.
FirstInst->DbgMarker->removeMarker();
@@ -1019,7 +1019,7 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
DPMarker *EndMarker = ExitBlock->getTrailingDbgRecords();
ASSERT_NE(EndMarker, nullptr);
- EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
+ EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
// Test again that it's those two DPValues, DPV1 and DPV2.
ItCount = 0;
for (DbgRecord &Item : EndMarker->getDbgRecordRange()) {
@@ -1115,14 +1115,14 @@ TEST(MetadataTest, DPValueConversionRoutines) {
EXPECT_EQ(FirstInst, FirstInst->DbgMarker->MarkedInstr);
EXPECT_EQ(SecondInst, SecondInst->DbgMarker->MarkedInstr);
- EXPECT_EQ(FirstInst->DbgMarker->StoredDPValues.size(), 1u);
+ EXPECT_EQ(FirstInst->DbgMarker->StoredDbgRecords.size(), 1u);
DPValue *DPV1 =
cast<DPValue>(&*FirstInst->DbgMarker->getDbgRecordRange().begin());
EXPECT_EQ(DPV1->getMarker(), FirstInst->DbgMarker);
// Should point at %a, an argument.
EXPECT_TRUE(isa<Argument>(DPV1->getVariableLocationOp(0)));
- EXPECT_EQ(SecondInst->DbgMarker->StoredDPValues.size(), 1u);
+ EXPECT_EQ(SecondInst->DbgMarker->StoredDbgRecords.size(), 1u);
DPValue *DPV2 =
cast<DPValue>(&*SecondInst->DbgMarker->getDbgRecordRange().begin());
EXPECT_EQ(DPV2->getMarker(), SecondInst->DbgMarker);
@@ -1135,7 +1135,7 @@ TEST(MetadataTest, DPValueConversionRoutines) {
EXPECT_TRUE(BB2->IsNewDbgInfoFormat);
for (auto &Inst : *BB2)
// Either there should be no marker, or it should be empty.
- EXPECT_TRUE(!Inst.DbgMarker || Inst.DbgMarker->StoredDPValues.empty());
+ EXPECT_TRUE(!Inst.DbgMarker || Inst.DbgMarker->StoredDbgRecords.empty());
// Validating the first block should continue to not be a problem,
Error = verifyModule(*M, &errs(), &BrokenDebugInfo);
diff --git a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
index eff8e27..0e9641c 100644
--- a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
@@ -37,7 +37,7 @@ define void @f() {
entry:
%o = alloca %class.Impl
%base = getelementptr %class.Impl, %class.Impl* %o, i64 0, i32 0, i32 0
- store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
+ store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
%f = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 1
store i32 3, i32* %f
%base.i = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 0
@@ -171,7 +171,7 @@ define void @f() {
entry:
%o = alloca %class.Impl
%base = getelementptr %class.Impl, %class.Impl* %o, i64 0, i32 0, i32 0
- store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
+ store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
%f = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 1
store i32 3, i32* %f
%base.i = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 0
@@ -213,7 +213,7 @@ define void @f() {
entry:
%o = alloca %class.Impl
%base = getelementptr %class.Impl, %class.Impl* %o, i64 0, i32 0, i32 0
- store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
+ store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
%f = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 1
store i32 3, i32* %f
%base.i = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 0
@@ -256,7 +256,7 @@ entry:
%a = alloca %struct.A, align 8
%0 = bitcast %struct.A* %a to i8*
%1 = getelementptr %struct.A, %struct.A* %a, i64 0, i32 0
- store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV1A, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %1, align 8
+ store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV1A, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %1, align 8
%2 = bitcast %struct.A* %a to i8*
%3 = bitcast i8* %2 to i8**
%vtable.i = load i8*, i8** %3, align 8
@@ -271,7 +271,7 @@ entry:
%a = alloca %struct.A, align 8
%0 = bitcast %struct.A* %a to i8*
%1 = getelementptr %struct.A, %struct.A* %a, i64 0, i32 0
- store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV1A, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %1, align 8
+ store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV1A, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %1, align 8
%2 = bitcast %struct.A* %a to i8*
%3 = bitcast i8* %2 to i8**
%vtable.i = load i8*, i8** %3, align 8
@@ -340,7 +340,7 @@ define %struct1 @f() {
entry:
%o = alloca %class.Impl
%base = getelementptr %class.Impl, %class.Impl* %o, i64 0, i32 0, i32 0
- store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
+ store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
%f = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 1
store i32 3, i32* %f
%base.i = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 0
diff --git a/llvm/unittests/Transforms/Utils/DebugifyTest.cpp b/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
index 89fa133..0b00734 100644
--- a/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
+++ b/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
@@ -60,7 +60,7 @@ struct DebugValueDrop : public FunctionPass {
for (Instruction &I : BB) {
if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
Dbgs.push_back(DVI);
- // If there are any non-intrinsic records (DPValues), drop those too.
+ // If there are any non-intrinsic records (DbgRecords), drop those too.
I.dropDbgRecords();
}
}
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
index 7c23e7b..c07ec18 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
@@ -17,7 +17,6 @@
#include <endian.h>
#include <fcntl.h>
#include <sys/mman.h>
-#include <sys/syscall.h>
#include <unistd.h>
#endif // __linux__
@@ -50,9 +49,7 @@ protected:
std::string getSharedMemoryName(const unsigned TestNumber,
const unsigned DefinitionNumber) {
- long CurrentTID = syscall(SYS_gettid);
- return "/" + std::to_string(getSharedMemoryNumber(TestNumber)) + "t" +
- std::to_string(CurrentTID) + "memdef" +
+ return "/" + std::to_string(getSharedMemoryNumber(TestNumber)) + "memdef" +
std::to_string(DefinitionNumber);
}
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index dd78dc0..628bff5 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -934,7 +934,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
unsigned Shift = 0;
do {
OS << ", " << (unsigned)*I;
- Value += (*I & 0x7f) << Shift;
+ Value += ((uint64_t)(*I & 0x7f)) << Shift;
Shift += 7;
} while (*I++ >= 128);
if (Value > 127) {
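Both hunks in this file fix the same latent truncation: `(*I & 0x7f) << Shift` is evaluated in int, so once Shift grows past 24 the shifted byte can overflow or lose high bits before being added to Value; widening to uint64_t first keeps all 64 bits. A self-contained sketch of the corrected decode loop, assuming the same ULEB128-style variable-width encoding the table emitter uses:

#include <cstdint>

// Decode one ULEB128/VBR value; the uint64_t cast mirrors the fix above.
static uint64_t decodeVBR(const uint8_t *&I) {
  uint64_t Value = 0;
  unsigned Shift = 0;
  uint8_t Byte;
  do {
    Byte = *I++;
    Value += ((uint64_t)(Byte & 0x7f)) << Shift; // widen before shifting
    Shift += 7;
  } while (Byte >= 128); // high bit set means more bytes follow
  return Value;
}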
@@ -947,7 +947,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
Shift = 0;
do {
OS << ", " << (unsigned)*I;
- Value += (*I & 0x7f) << Shift;
+ Value += ((uint64_t)(*I & 0x7f)) << Shift;
Shift += 7;
} while (*I++ >= 128);
if (Value > 127) {