author    Vitaly Buka <vitalybuka@google.com>  2024-07-26 10:20:26 -0700
committer Vitaly Buka <vitalybuka@google.com>  2024-07-26 10:20:26 -0700
commit    f75b8a41bbf63c4424af9cc289ce958ba3364342 (patch)
tree      a7d5a2029b12fd7e0ea1ec129204da5498e9cd8e
parent    7c586d05f48f39515fd981a44d75efb5356975c2 (diff)
parent    d683d378998c85c12d7f0549944f807bb44c7b76 (diff)
[spr] changes introduced through rebase (users/vitalybuka/spr/main.ubsanhwasan-let-mixing-filters)
Created using spr 1.3.4 [skip ci]
-rw-r--r-- .github/workflows/release-asset-audit.py | 51
-rw-r--r-- .github/workflows/release-asset-audit.yml | 54
-rw-r--r-- .github/workflows/release-sources.yml | 2
-rwxr-xr-x clang-tools-extra/clang-tidy/add_new_check.py | 41
-rw-r--r-- clang-tools-extra/test/clang-doc/basic-project.test | 3
-rw-r--r-- clang/docs/LanguageExtensions.rst | 1
-rw-r--r-- clang/docs/ReleaseNotes.rst | 6
-rw-r--r-- clang/include/clang/AST/Type.h | 11
-rw-r--r-- clang/include/clang/Basic/DiagnosticSemaKinds.td | 2
-rw-r--r-- clang/include/clang/Basic/LangOptions.def | 2
-rw-r--r-- clang/include/clang/Basic/TokenKinds.def | 1
-rw-r--r-- clang/include/clang/Driver/Options.td | 14
-rw-r--r-- clang/include/clang/StaticAnalyzer/Checkers/Checkers.td | 12
-rw-r--r-- clang/lib/AST/DeclBase.cpp | 3
-rw-r--r-- clang/lib/AST/Interp/Disasm.cpp | 11
-rw-r--r-- clang/lib/AST/Interp/InterpBuiltin.cpp | 37
-rw-r--r-- clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp | 2
-rw-r--r-- clang/lib/Basic/LangOptions.cpp | 4
-rw-r--r-- clang/lib/Basic/Targets/OSTargets.cpp | 8
-rw-r--r-- clang/lib/CodeGen/CGAtomic.cpp | 13
-rw-r--r-- clang/lib/CodeGen/CGExprScalar.cpp | 13
-rw-r--r-- clang/lib/CodeGen/CGStmtOpenMP.cpp | 4
-rw-r--r-- clang/lib/CodeGen/CodeGenFunction.h | 7
-rw-r--r-- clang/lib/CodeGen/TargetInfo.h | 4
-rw-r--r-- clang/lib/CodeGen/Targets/AMDGPU.cpp | 19
-rw-r--r-- clang/lib/Driver/ToolChains/AMDGPU.cpp | 6
-rw-r--r-- clang/lib/Driver/ToolChains/Clang.cpp | 28
-rw-r--r-- clang/lib/Driver/ToolChains/PS4CPU.cpp | 4
-rw-r--r-- clang/lib/Frontend/InitPreprocessor.cpp | 2
-rw-r--r-- clang/lib/Frontend/PrintPreprocessedOutput.cpp | 1
-rw-r--r-- clang/lib/Lex/PPMacroExpansion.cpp | 10
-rw-r--r-- clang/lib/Sema/SemaChecking.cpp | 15
-rw-r--r-- clang/lib/Sema/SemaDecl.cpp | 24
-rw-r--r-- clang/lib/Sema/SemaDeclAttr.cpp | 7
-rw-r--r-- clang/lib/Sema/SemaDeclCXX.cpp | 2
-rw-r--r-- clang/lib/Sema/SemaExprCXX.cpp | 26
-rw-r--r-- clang/lib/Sema/SemaPPC.cpp | 3
-rw-r--r-- clang/lib/Sema/SemaTemplateDeduction.cpp | 4
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp | 58
-rw-r--r-- clang/test/AST/Interp/builtins.cpp | 5
-rw-r--r-- clang/test/Analysis/analyzer-config.c | 2
-rw-r--r-- clang/test/Analysis/mmap-writeexec.c | 11
-rw-r--r-- clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c | 316
-rw-r--r-- clang/test/CodeGen/aarch64-fmv-streaming.c | 107
-rw-r--r-- clang/test/CodeGen/finite-math.c | 2
-rw-r--r-- clang/test/CodeGen/fp-floatcontrol-stack.cpp | 2
-rw-r--r-- clang/test/CodeGen/fp-options-to-fast-math-flags.c | 2
-rw-r--r-- clang/test/CodeGen/nofpclass.c | 4
-rw-r--r-- clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu | 102
-rw-r--r-- clang/test/CodeGenOpenCL/builtins-alloca.cl | 141
-rw-r--r-- clang/test/CodeGenOpenCL/relaxed-fpmath.cl | 4
-rw-r--r-- clang/test/Driver/amdgpu-toolchain.c | 4
-rw-r--r-- clang/test/Driver/opencl.cl | 2
-rw-r--r-- clang/test/Driver/stack-size-section.c | 1
-rw-r--r-- clang/test/Headers/__clang_hip_cmath.hip | 3
-rw-r--r-- clang/test/Headers/__clang_hip_math.hip | 3
-rw-r--r-- clang/test/Headers/float.c | 2
-rw-r--r-- clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp | 59
-rw-r--r-- clang/test/Parser/namelookup-anonymous-struct.c | 6
-rw-r--r-- clang/test/Preprocessor/predefined-macros.c | 4
-rw-r--r-- clang/test/Sema/aarch64-fmv-streaming.c | 46
-rw-r--r-- clang/test/Sema/aarch64-sme-func-attrs.c | 42
-rw-r--r-- clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp | 9
-rw-r--r-- clang/test/Sema/warn-infinity-nan-disabled-win.cpp | 5
-rw-r--r-- clang/test/SemaCXX/pr100095.cpp | 17
-rw-r--r-- clang/test/SemaCXX/type-traits.cpp | 74
-rw-r--r-- clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp | 7
-rw-r--r-- clang/tools/clang-nvlink-wrapper/NVLinkOpts.td | 7
-rw-r--r-- clang/unittests/AST/ASTImporterTest.cpp | 37
-rw-r--r-- clang/unittests/AST/StructuralEquivalenceTest.cpp | 14
-rw-r--r-- clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp | 49
-rw-r--r-- flang/lib/Frontend/CompilerInvocation.cpp | 2
-rw-r--r-- flang/lib/Semantics/check-cuda.cpp | 32
-rw-r--r-- flang/runtime/transformational.cpp | 5
-rw-r--r-- flang/test/Lower/CUDA/cuda-data-transfer.cuf | 2
-rw-r--r-- flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf | 4
-rw-r--r-- flang/test/Parser/cuf-sanity-common | 3
-rw-r--r-- flang/test/Semantics/cuf09.cuf | 10
-rw-r--r-- flang/test/Semantics/reduce.cuf | 6
-rw-r--r-- libc/config/darwin/arm/entrypoints.txt | 1
-rw-r--r-- libc/config/linux/aarch64/entrypoints.txt | 1
-rw-r--r-- libc/config/linux/arm/entrypoints.txt | 1
-rw-r--r-- libc/config/linux/riscv/entrypoints.txt | 1
-rw-r--r-- libc/config/linux/x86_64/entrypoints.txt | 1
-rw-r--r-- libc/config/windows/entrypoints.txt | 1
-rw-r--r-- libc/docs/math/index.rst | 2
-rw-r--r-- libc/spec/stdc.td | 1
-rw-r--r-- libc/src/math/generic/CMakeLists.txt | 20
-rw-r--r-- libc/src/math/generic/atan2.cpp | 313
-rw-r--r-- libc/startup/gpu/CMakeLists.txt | 10
-rw-r--r-- libc/test/src/math/CMakeLists.txt | 12
-rw-r--r-- libc/test/src/math/atan2_test.cpp | 125
-rw-r--r-- libc/test/src/math/smoke/CMakeLists.txt | 10
-rw-r--r-- libc/test/src/math/smoke/atan2_test.cpp | 22
-rw-r--r-- libcxx/CMakeLists.txt | 16
-rw-r--r-- libcxx/cmake/Modules/CodeCoverage.cmake | 50
-rw-r--r-- libcxx/cmake/caches/Generic-no-exceptions.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-experimental.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-filesystem.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-localization.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-random_device.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-rtti.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-threads.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-tzdb.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-unicode.cmake | 4
-rw-r--r-- libcxx/cmake/caches/Generic-no-wide-characters.cmake | 4
-rw-r--r-- libcxx/docs/Status/Cxx20Issues.csv | 10
-rw-r--r-- libcxx/docs/Status/Cxx23Issues.csv | 12
-rw-r--r-- libcxx/docs/Status/Cxx2cIssues.csv | 2
-rw-r--r-- libcxx/include/__thread/thread.h | 1
-rw-r--r-- libcxx/src/CMakeLists.txt | 5
-rw-r--r-- libcxx/test/CMakeLists.txt | 18
-rw-r--r-- libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp | 1
-rw-r--r-- lld/ELF/Arch/LoongArch.cpp | 10
-rw-r--r-- lld/ELF/Relocations.cpp | 3
-rw-r--r-- lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s | 129
-rw-r--r-- lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s | 82
-rw-r--r-- lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s | 142
-rw-r--r-- lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp | 4
-rw-r--r-- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp | 113
-rw-r--r-- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h | 3
-rw-r--r-- lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py | 3
-rw-r--r-- lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py | 3
-rw-r--r-- lldb/tools/lldb-dap/README.md | 31
-rw-r--r-- lldb/tools/lldb-dap/package.json | 4
-rw-r--r-- llvm/bindings/ocaml/llvm/llvm.ml | 1
-rw-r--r-- llvm/docs/ProgrammersManual.rst | 2
-rw-r--r-- llvm/include/llvm/ADT/DenseMap.h | 3
-rw-r--r-- llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 22
-rw-r--r-- llvm/include/llvm/CodeGen/AsmPrinter.h | 6
-rw-r--r-- llvm/include/llvm/CodeGen/FunctionLoweringInfo.h | 9
-rw-r--r-- llvm/include/llvm/CodeGen/MIRPrinter.h | 1
-rw-r--r-- llvm/include/llvm/CodeGen/MachineFunction.h | 19
-rw-r--r-- llvm/include/llvm/CodeGen/MachineInstr.h | 2
-rw-r--r-- llvm/include/llvm/CodeGen/MachineModuleInfo.h | 38
-rw-r--r-- llvm/include/llvm/CodeGen/SDPatternMatch.h | 6
-rw-r--r-- llvm/include/llvm/CodeGen/SelectionDAG.h | 1
-rw-r--r-- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 1
-rw-r--r-- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 119
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 18
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 13
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 9
-rw-r--r-- llvm/lib/CodeGen/BranchFolding.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/DeadMachineInstructionElim.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/EarlyIfConversion.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/Utils.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/IfConversion.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/LiveRangeEdit.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/LiveRangeShrink.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/MachineFunction.cpp | 17
-rw-r--r-- llvm/lib/CodeGen/MachineFunctionAnalysis.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/MachineInstr.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/MachineLICM.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/MachineModuleInfo.cpp | 8
-rw-r--r-- llvm/lib/CodeGen/MachineSink.cpp | 8
-rw-r--r-- llvm/lib/CodeGen/ModuloSchedule.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/RegisterCoalescer.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 16
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 10
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 27
-rw-r--r-- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp | 4
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 26
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 31
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SMInstructions.td | 22
-rw-r--r-- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp | 2
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonGenInsert.cpp | 2
-rw-r--r-- llvm/lib/Target/Lanai/LanaiInstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp | 24
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchInstrInfo.td | 6
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h | 8
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp | 12
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp | 15
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h | 4
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 8
-rw-r--r-- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 12
-rw-r--r-- llvm/lib/Target/RISCV/RISCVFeatures.td | 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 122
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVProcessors.td | 20
-rw-r--r-- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 4
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 3
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 3
-rw-r--r-- llvm/lib/Target/X86/X86AsmPrinter.cpp | 30
-rw-r--r-- llvm/lib/Target/X86/X86InstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/X86/X86SchedBroadwell.td | 6
-rw-r--r-- llvm/lib/Target/X86/X86SchedHaswell.td | 6
-rw-r--r-- llvm/lib/Target/X86/X86SchedSandyBridge.td | 6
-rw-r--r-- llvm/lib/Target/X86/X86SchedSkylakeClient.td | 2
-rw-r--r-- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 58
-rw-r--r-- llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Scalar/LICM.cpp | 70
-rw-r--r-- llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp | 515
-rw-r--r-- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 27
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 4
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 44
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.h | 27
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 6
-rw-r--r-- llvm/test/Analysis/CostModel/X86/arith-overflow.ll | 462
-rw-r--r-- llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll | 8
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll | 26
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll | 4
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/print-order.ll | 6
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll | 4
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll | 103
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-elf.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 24
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/basic-branch.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/early-term.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll | 14
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hsa.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll | 787
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/sgpr-spill.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/verify-vopd.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wqm.ll | 2
-rw-r--r-- llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll | 2
-rw-r--r-- llvm/test/CodeGen/PowerPC/common-chain.ll | 315
-rw-r--r-- llvm/test/CodeGen/RISCV/O3-pipeline.ll | 2
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll | 109
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll | 20
-rw-r--r-- llvm/test/CodeGen/SystemZ/vec-combine-01.ll | 10
-rw-r--r-- llvm/test/CodeGen/X86/known-bits.ll | 2
-rw-r--r-- llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll | 23
-rw-r--r-- llvm/test/MC/LoongArch/Basic/Integer/invalid.s | 6
-rw-r--r-- llvm/test/MC/LoongArch/Relocations/relocations.s | 20
-rw-r--r-- llvm/test/Transforms/LICM/hoist-binop.ll | 226
-rw-r--r-- llvm/test/Transforms/LICM/sink-foldable.ll | 5
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll | 4
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll | 22
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll | 32
-rw-r--r-- llvm/test/Transforms/SimplifyCFG/AMDGPU/skip-threading.ll | 44
-rw-r--r-- llvm/test/Transforms/SimplifyCFG/convergent.ll | 39
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s | 10
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s | 132
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s | 14
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s | 10
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s | 14
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s | 10
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s | 220
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s | 10
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s | 10
-rw-r--r-- llvm/tools/llvm-driver/llvm-driver.cpp | 2
-rw-r--r-- llvm/tools/llvm-reduce/ReducerWorkItem.cpp | 2
-rw-r--r-- llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp | 4
-rw-r--r-- llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 2
-rw-r--r-- llvm/unittests/CodeGen/MFCommon.inc | 3
-rw-r--r-- llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp | 2
-rw-r--r-- llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp | 15
-rw-r--r-- llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp | 3
-rw-r--r-- llvm/unittests/Target/AMDGPU/PALMetadata.cpp | 2
-rw-r--r-- llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp | 2
-rw-r--r-- llvm/unittests/Transforms/Vectorize/VPlanTest.cpp | 4
-rw-r--r-- llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn | 1
-rw-r--r-- llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn | 9
-rw-r--r-- llvm/utils/gn/secondary/llvm/test/BUILD.gn | 1
-rw-r--r-- llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn | 11
-rw-r--r-- mlir/docs/DefiningDialects/Operations.md | 53
-rw-r--r-- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 15
-rw-r--r-- mlir/include/mlir/IR/ODSSupport.h | 48
-rw-r--r-- mlir/include/mlir/IR/Properties.td | 565
-rw-r--r-- mlir/include/mlir/TableGen/Operator.h | 2
-rw-r--r-- mlir/include/mlir/TableGen/Property.h | 53
-rw-r--r-- mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h | 1
-rw-r--r-- mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp | 91
-rw-r--r-- mlir/lib/IR/ODSSupport.cpp | 71
-rw-r--r-- mlir/lib/TableGen/Property.cpp | 59
-rw-r--r-- mlir/test/Dialect/ArmSME/vector-legalization.mlir | 26
-rw-r--r-- mlir/test/IR/properties.mlir | 52
-rw-r--r-- mlir/test/IR/traits.mlir | 19
-rw-r--r-- mlir/test/Transforms/test-legalizer.mlir | 4
-rw-r--r-- mlir/test/lib/Dialect/Test/TestFormatUtils.cpp | 16
-rw-r--r-- mlir/test/lib/Dialect/Test/TestFormatUtils.h | 3
-rw-r--r-- mlir/test/lib/Dialect/Test/TestOps.td | 83
-rw-r--r-- mlir/test/lib/Dialect/Test/TestOpsSyntax.td | 22
-rw-r--r-- mlir/test/mlir-tblgen/op-format.mlir | 10
-rw-r--r-- mlir/test/mlir-tblgen/op-format.td | 4
-rw-r--r-- mlir/test/mlir-tblgen/op-properties.td | 120
-rw-r--r-- mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 200
-rw-r--r-- mlir/tools/mlir-tblgen/OpFormatGen.cpp | 290
-rw-r--r-- utils/bazel/llvm-project-overlay/bolt/BUILD.bazel | 11
-rw-r--r-- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 5
-rw-r--r-- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 3
342 files changed, 7159 insertions, 2266 deletions
diff --git a/.github/workflows/release-asset-audit.py b/.github/workflows/release-asset-audit.py
new file mode 100644
index 0000000..355e7fe
--- /dev/null
+++ b/.github/workflows/release-asset-audit.py
@@ -0,0 +1,51 @@
+import github
+import sys
+
+def main():
+    token = sys.argv[1]
+
+    gh = github.Github(login_or_token=token)
+    repo = gh.get_repo("llvm/llvm-project")
+
+    uploaders = set(
+        [
+            "DimitryAndric",
+            "stefanp-ibm",
+            "lei137",
+            "omjavaid",
+            "nicolerabjohn",
+            "amy-kwan",
+            "mandlebug",
+            "zmodem",
+            "androm3da",
+            "tru",
+            "rovka",
+            "rorth",
+            "quinnlp",
+            "kamaub",
+            "abrisco",
+            "jakeegan",
+            "maryammo",
+            "tstellar",
+            "github-actions[bot]",
+        ]
+    )
+
+    for release in repo.get_releases():
+        print("Release:", release.title)
+        for asset in release.get_assets():
+            created_at = asset.created_at
+            updated_at = (
+                "" if asset.created_at == asset.updated_at else asset.updated_at
+            )
+            print(
+                f"{asset.name} : {asset.uploader.login} [{created_at} {updated_at}] ( {asset.download_count} )"
+            )
+            if asset.uploader.login not in uploaders:
+                with open('comment', 'w') as file:
+                    file.write(f'@{asset.uploader.login} is not a valid uploader.')
+                sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
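For context: the workflow added below invokes this script as "python3 ./.github/workflows/release-asset-audit.py $GITHUB_TOKEN". When an asset's uploader is not in the allowlist above, the script writes the offending login to a file named "comment" and exits non-zero, which the workflow's "File Issue" step then turns into a tracking issue.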
diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml
new file mode 100644
index 0000000..fd42bc6
--- /dev/null
+++ b/.github/workflows/release-asset-audit.yml
@@ -0,0 +1,54 @@
+name: Release Asset Audit
+
+on:
+  workflow_dispatch:
+  release:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    # Run once an hour
+    - cron: '5 * * * *'
+
+  pull_request:
+    paths:
+      - ".github/workflows/release-asset-audit.py"
+      - ".github/workflows/release-asset-audit.yml"
+
+permissions:
+  contents: read # Default everything to read-only
+
+jobs:
+  audit:
+    name: "Release Asset Audit"
+    runs-on: ubuntu-22.04
+    if: github.repository == 'llvm/llvm-project'
+    steps:
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 #v4.1.6
+      - name: "Run Audit Script"
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: |
+          pip install --require-hashes -r ./llvm/utils/git/requirements.txt
+          python3 ./.github/workflows/release-asset-audit.py $GITHUB_TOKEN
+      - name: "File Issue"
+        if: >-
+          github.event_name != 'pull_request' &&
+          failure()
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
+        with:
+          github-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}
+          script: |
+            var fs = require('fs');
+            var body = ''
+            if (fs.existsSync('./comment')) {
+              body = JSON.parse(fs.readFileSync('./comment')) + "\n\n";
+            }
+            body = body + `\n\nhttps://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`
+
+            const issue = await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: "Release Asset Audit Failed",
+              labels: ['infrastructure'],
+              body: body
+            });
+            console.log(issue);
diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml
index 9c5b1a9..b0c0b65 100644
--- a/.github/workflows/release-sources.yml
+++ b/.github/workflows/release-sources.yml
@@ -47,7 +47,7 @@ jobs:
steps:
- id: inputs
run: |
- ref=${{ inputs.release-version || github.sha }}
+ ref=${{ (inputs.release-version && format('llvmorg-{0}', inputs.release-version)) || github.sha }}
if [ -n "${{ inputs.release-version }}" ]; then
export_args="-release ${{ inputs.release-version }} -final"
else
diff --git a/clang-tools-extra/clang-tidy/add_new_check.py b/clang-tools-extra/clang-tidy/add_new_check.py
index 3a62df1..1ce2019 100755
--- a/clang-tools-extra/clang-tidy/add_new_check.py
+++ b/clang-tools-extra/clang-tidy/add_new_check.py
@@ -16,6 +16,7 @@ import io
 import os
 import re
 import sys
+import textwrap
 
 # Adapts the module's CMakelist file. Returns 'True' if it could add a new
 # entry and 'False' if the entry already existed.
@@ -53,7 +54,14 @@ def adapt_cmake(module_path, check_name_camel):
 
 # Adds a header for the new check.
-def write_header(module_path, module, namespace, check_name, check_name_camel):
+def write_header(
+    module_path, module, namespace, check_name, check_name_camel, description
+):
+    wrapped_desc = "\n".join(
+        textwrap.wrap(
+            description, width=80, initial_indent="/// ", subsequent_indent="/// "
+        )
+    )
     filename = os.path.join(module_path, check_name_camel) + ".h"
     print("Creating %s..." % filename)
     with io.open(filename, "w", encoding="utf8", newline="\n") as f:
@@ -85,7 +93,7 @@ def write_header(module_path, module, namespace, check_name, check_name_camel):
 
 namespace clang::tidy::%(namespace)s {
 
-/// FIXME: Write a short description.
+%(description)s
 ///
 /// For the user-facing documentation see:
 /// http://clang.llvm.org/extra/clang-tidy/checks/%(module)s/%(check_name)s.html
@@ -107,6 +115,7 @@ public:
             "check_name": check_name,
             "module": module,
             "namespace": namespace,
+            "description": wrapped_desc,
         }
     )
@@ -235,7 +244,12 @@ def adapt_module(module_path, module, check_name, check_name_camel):
 
 # Adds a release notes entry.
-def add_release_notes(module_path, module, check_name):
+def add_release_notes(module_path, module, check_name, description):
+    wrapped_desc = "\n".join(
+        textwrap.wrap(
+            description, width=80, initial_indent="  ", subsequent_indent="  "
+        )
+    )
     check_name_dashes = module + "-" + check_name
     filename = os.path.normpath(
         os.path.join(module_path, "../../docs/ReleaseNotes.rst")
@@ -281,10 +295,10 @@ def add_release_notes(module_path, module, check_name):
                 """- New :doc:`%s
   <clang-tidy/checks/%s/%s>` check.
 
-  FIXME: add release notes.
+%s
 """
-                % (check_name_dashes, module, check_name)
+                % (check_name_dashes, module, check_name, wrapped_desc)
             )
             note_added = True
@@ -613,6 +627,13 @@ def main():
         metavar="LANG",
     )
     parser.add_argument(
+        "--description",
+        "-d",
+        help="short description of what the check does",
+        default="FIXME: Write a short description",
+        type=str,
+    )
+    parser.add_argument(
         "module",
         nargs="?",
         help="module directory under which to place the new tidy check (e.g., misc)",
@@ -652,10 +673,16 @@ def main():
     else:
         namespace = module
 
-    write_header(module_path, module, namespace, check_name, check_name_camel)
+    description = args.description
+    if not description.endswith("."):
+        description += "."
+
+    write_header(
+        module_path, module, namespace, check_name, check_name_camel, description
+    )
     write_implementation(module_path, module, namespace, check_name_camel)
     adapt_module(module_path, module, check_name, check_name_camel)
-    add_release_notes(module_path, module, check_name)
+    add_release_notes(module_path, module, check_name, description)
     test_extension = language_to_extension.get(args.language)
    write_test(module_path, module, check_name, test_extension)
    write_docs(module_path, module, check_name)
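For context, a hypothetical invocation of the extended script (the module, check name, and text here are illustrative only):

    ./add_new_check.py --description "Detects suspicious calls to foo()" misc suspicious-foo-call

The description is wrapped to 80 columns, gets a trailing period appended if missing, and replaces the former FIXME placeholders in both the generated header comment and the ReleaseNotes.rst entry.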
diff --git a/clang-tools-extra/test/clang-doc/basic-project.test b/clang-tools-extra/test/clang-doc/basic-project.test
index 51d3ac6..38569d8 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.test
@@ -1,3 +1,6 @@
+// See https://github.com/llvm/llvm-project/issues/97507.
+// UNSUPPORTED: target={{.*}}
+
// RUN: rm -rf %t && mkdir -p %t/docs %t/build
// RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json
// RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 81784c7..a747464 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -1546,6 +1546,7 @@ The following type trait primitives are supported by Clang. Those traits marked
* ``__array_extent(type, dim)`` (Embarcadero):
The ``dim``'th array bound in the type ``type``, or ``0`` if
``dim >= __array_rank(type)``.
+* ``__builtin_is_virtual_base_of`` (C++, GNU, Microsoft)
* ``__can_pass_in_regs`` (C++)
Returns whether a class can be passed in registers under the current
ABI. This type can only be applied to unqualified class types.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 5dddd8f..286f319 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -81,6 +81,9 @@ C++23 Feature Support
C++2c Feature Support
^^^^^^^^^^^^^^^^^^^^^
+- Add ``__builtin_is_virtual_base_of`` intrinsic, which supports
+ `P2985R0 A type trait for detecting virtual base classes <https://wg21.link/p2985r0>`_
+
Resolutions to C++ Defect Reports
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -163,6 +166,9 @@ Miscellaneous Bug Fixes
Miscellaneous Clang Crashes Fixed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- Fixed a crash in C caused by incorrect lookup, where members of nested anonymous
+  structs/unions could be found as ordinary identifiers during struct/union definition. (#GH31295)
+
OpenACC Specific Changes
------------------------
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 72723c7..89a74ff 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -4698,26 +4698,25 @@ public:
};
private:
- LLVM_PREFERRED_TYPE(Kind)
- unsigned FKind : 3;
+ Kind FKind;
// Expansion: for hypothetical TCB+types, there could be one Kind for TCB,
// then ~16(?) bits "SubKind" to map to a specific named TCB. SubKind would
// be considered for uniqueness.
public:
- FunctionEffect() : FKind(unsigned(Kind::None)) {}
+ FunctionEffect() : FKind(Kind::None) {}
- explicit FunctionEffect(Kind K) : FKind(unsigned(K)) {}
+ explicit FunctionEffect(Kind K) : FKind(K) {}
/// The kind of the effect.
- Kind kind() const { return Kind(FKind); }
+ Kind kind() const { return FKind; }
/// Return the opposite kind, for effects which have opposites.
Kind oppositeKind() const;
/// For serialization.
- uint32_t toOpaqueInt32() const { return FKind; }
+ uint32_t toOpaqueInt32() const { return uint32_t(FKind); }
static FunctionEffect fromOpaqueInt32(uint32_t Value) {
return FunctionEffect(Kind(Value));
}
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 810abe4..beee243 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3812,8 +3812,6 @@ def warn_sme_locally_streaming_has_vl_args_returns : Warning<
InGroup<AArch64SMEAttributes>, DefaultIgnore;
def err_conflicting_attributes_arm_state : Error<
"conflicting attributes for state '%0'">;
-def err_sme_streaming_cannot_be_multiversioned : Error<
- "streaming function cannot be multi-versioned">;
def err_unknown_arm_state : Error<
"unknown state '%0'">;
def err_missing_arm_state : Error<
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 834a6f6..0035092c 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -224,7 +224,6 @@ COMPATIBLE_LANGOPT(GNUInline , 1, 0, "GNU inline semantics")
COMPATIBLE_LANGOPT(NoInlineDefine , 1, 0, "__NO_INLINE__ predefined macro")
COMPATIBLE_LANGOPT(Deprecated , 1, 0, "__DEPRECATED predefined macro")
COMPATIBLE_LANGOPT(FastMath , 1, 0, "fast FP math optimizations, and __FAST_MATH__ predefined macro")
-COMPATIBLE_LANGOPT(FiniteMathOnly , 1, 0, "__FINITE_MATH_ONLY__ predefined macro")
COMPATIBLE_LANGOPT(UnsafeFPMath , 1, 0, "Unsafe Floating Point Math")
COMPATIBLE_LANGOPT(ProtectParens , 1, 0, "optimizer honors parentheses "
"when floating-point expressions are evaluated")
@@ -340,7 +339,6 @@ LANGOPT(SinglePrecisionConstants , 1, 0, "treating double-precision floating poi
LANGOPT(FastRelaxedMath , 1, 0, "OpenCL fast relaxed math")
BENIGN_LANGOPT(CLNoSignedZero , 1, 0, "Permit Floating Point optimization without regard to signed zeros")
COMPATIBLE_LANGOPT(CLUnsafeMath , 1, 0, "Unsafe Floating Point Math")
-COMPATIBLE_LANGOPT(CLFiniteMathOnly , 1, 0, "__FINITE_MATH_ONLY__ predefined macro")
/// FP_CONTRACT mode (on/off/fast).
BENIGN_ENUM_LANGOPT(DefaultFPContractMode, FPModeKind, 2, FPM_Off, "FP contraction type")
COMPATIBLE_LANGOPT(ExpStrictFP, 1, false, "Enable experimental strict floating point")
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 8c54661..7e638dc 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -501,6 +501,7 @@ TYPE_TRAIT_1(__has_trivial_move_assign, HasTrivialMoveAssign, KEYCXX)
TYPE_TRAIT_1(__has_trivial_move_constructor, HasTrivialMoveConstructor, KEYCXX)
// GNU and MS Type Traits
+TYPE_TRAIT_2(__builtin_is_virtual_base_of, IsVirtualBaseOf, KEYCXX)
TYPE_TRAIT_1(__has_nothrow_assign, HasNothrowAssign, KEYCXX)
TYPE_TRAIT_1(__has_nothrow_copy, HasNothrowCopy, KEYCXX)
TYPE_TRAIT_1(__has_nothrow_constructor, HasNothrowConstructor, KEYCXX)
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 26811bf..ccccc95 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1117,8 +1117,7 @@ def cl_single_precision_constant : Flag<["-"], "cl-single-precision-constant">,
MarshallingInfoFlag<LangOpts<"SinglePrecisionConstants">>;
def cl_finite_math_only : Flag<["-"], "cl-finite-math-only">, Group<opencl_Group>,
Visibility<[ClangOption, CC1Option]>,
- HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">,
- MarshallingInfoFlag<LangOpts<"CLFiniteMathOnly">>;
+ HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">;
def cl_kernel_arg_info : Flag<["-"], "cl-kernel-arg-info">, Group<opencl_Group>,
Visibility<[ClangOption, CC1Option]>,
HelpText<"OpenCL only. Generate kernel argument metadata.">,
@@ -2609,13 +2608,12 @@ defm approx_func : BoolFOption<"approx-func", LangOpts<"ApproxFunc">, DefaultFal
"with an approximately equivalent calculation",
[funsafe_math_optimizations.KeyPath]>,
NegFlag<SetFalse, [], [ClangOption, CC1Option, FC1Option, FlangOption]>>;
-defm finite_math_only : BoolFOption<"finite-math-only",
- LangOpts<"FiniteMathOnly">, DefaultFalse,
+defm finite_math_only : BoolOptionWithoutMarshalling<"f", "finite-math-only",
PosFlag<SetTrue, [], [ClangOption, CC1Option],
"Allow floating-point optimizations that "
"assume arguments and results are not NaNs or +-inf. This defines "
"the \\_\\_FINITE\\_MATH\\_ONLY\\_\\_ preprocessor macro.",
- [cl_finite_math_only.KeyPath, ffast_math.KeyPath]>,
+ [ffast_math.KeyPath]>,
NegFlag<SetFalse>>;
defm signed_zeros : BoolFOption<"signed-zeros",
LangOpts<"NoSignedZero">, DefaultFalse,
@@ -7815,10 +7813,10 @@ def mreassociate : Flag<["-"], "mreassociate">,
MarshallingInfoFlag<LangOpts<"AllowFPReassoc">>, ImpliedByAnyOf<[funsafe_math_optimizations.KeyPath]>;
def menable_no_nans : Flag<["-"], "menable-no-nans">,
HelpText<"Allow optimization to assume there are no NaNs.">,
- MarshallingInfoFlag<LangOpts<"NoHonorNaNs">>, ImpliedByAnyOf<[ffinite_math_only.KeyPath]>;
-def menable_no_infinities : Flag<["-"], "menable-no-infs">,
+ MarshallingInfoFlag<LangOpts<"NoHonorNaNs">>, ImpliedByAnyOf<[ffast_math.KeyPath]>;
+def menable_no_infs : Flag<["-"], "menable-no-infs">,
HelpText<"Allow optimization to assume there are no infinities.">,
- MarshallingInfoFlag<LangOpts<"NoHonorInfs">>, ImpliedByAnyOf<[ffinite_math_only.KeyPath]>;
+ MarshallingInfoFlag<LangOpts<"NoHonorInfs">>, ImpliedByAnyOf<[ffast_math.KeyPath]>;
def pic_level : Separate<["-"], "pic-level">,
HelpText<"Value for __PIC__">,
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index ec5dbd2..38b55a0 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -1045,18 +1045,6 @@ def MallocOverflowSecurityChecker : Checker<"MallocOverflow">,
def MmapWriteExecChecker : Checker<"MmapWriteExec">,
HelpText<"Warn on mmap() calls that are both writable and executable">,
- CheckerOptions<[
- CmdLineOption<Integer,
- "MmapProtExec",
- "Specifies the value of PROT_EXEC",
- "0x04",
- Released>,
- CmdLineOption<Integer,
- "MmapProtRead",
- "Specifies the value of PROT_READ",
- "0x01",
- Released>
- ]>,
Documentation<HasDocumentation>;
def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">,
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index bc5a920..a1f7054 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -879,8 +879,6 @@ unsigned Decl::getIdentifierNamespaceForKind(Kind DeclKind) {
return IDNS_Ordinary;
case Label:
return IDNS_Label;
- case IndirectField:
- return IDNS_Ordinary | IDNS_Member;
case Binding:
case NonTypeTemplateParm:
@@ -918,6 +916,7 @@ unsigned Decl::getIdentifierNamespaceForKind(Kind DeclKind) {
return IDNS_ObjCProtocol;
case Field:
+ case IndirectField:
case ObjCAtDefsField:
case ObjCIvar:
return IDNS_Member;
diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp
index 867284e..5e3a5b9 100644
--- a/clang/lib/AST/Interp/Disasm.cpp
+++ b/clang/lib/AST/Interp/Disasm.cpp
@@ -278,10 +278,15 @@ LLVM_DUMP_METHOD void InterpFrame::dump(llvm::raw_ostream &OS,
OS << "\n";
OS.indent(Spaces) << "This: " << getThis() << "\n";
OS.indent(Spaces) << "RVO: " << getRVOPtr() << "\n";
-
- while (const InterpFrame *F = this->Caller) {
+ OS.indent(Spaces) << "Depth: " << Depth << "\n";
+ OS.indent(Spaces) << "ArgSize: " << ArgSize << "\n";
+ OS.indent(Spaces) << "Args: " << (void *)Args << "\n";
+ OS.indent(Spaces) << "FrameOffset: " << FrameOffset << "\n";
+ OS.indent(Spaces) << "FrameSize: " << (Func ? Func->getFrameSize() : 0)
+ << "\n";
+
+ for (const InterpFrame *F = this->Caller; F; F = F->Caller) {
F->dump(OS, Indent + 1);
- F = F->Caller;
}
}
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index c59bbc8..d7538c7 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -1170,27 +1170,30 @@ static bool interp__builtin_constant_p(InterpState &S, CodePtr OpPC,
Stk.clear();
}
- const APValue &LV = Res.toAPValue();
- if (!Res.isInvalid() && LV.isLValue()) {
- APValue::LValueBase Base = LV.getLValueBase();
- if (Base.isNull()) {
- // A null base is acceptable.
- return returnInt(true);
- } else if (const auto *E = Base.dyn_cast<const Expr *>()) {
- if (!isa<StringLiteral>(E))
+ if (!Res.isInvalid() && !Res.empty()) {
+ const APValue &LV = Res.toAPValue();
+ if (LV.isLValue()) {
+ APValue::LValueBase Base = LV.getLValueBase();
+ if (Base.isNull()) {
+ // A null base is acceptable.
+ return returnInt(true);
+ } else if (const auto *E = Base.dyn_cast<const Expr *>()) {
+ if (!isa<StringLiteral>(E))
+ return returnInt(false);
+ return returnInt(LV.getLValueOffset().isZero());
+ } else if (Base.is<TypeInfoLValue>()) {
+ // Surprisingly, GCC considers __builtin_constant_p(&typeid(int)) to
+ // evaluate to true.
+ return returnInt(true);
+ } else {
+ // Any other base is not constant enough for GCC.
return returnInt(false);
- return returnInt(LV.getLValueOffset().isZero());
- } else if (Base.is<TypeInfoLValue>()) {
- // Surprisingly, GCC considers __builtin_constant_p(&typeid(int)) to
- // evaluate to true.
- return returnInt(true);
- } else {
- // Any other base is not constant enough for GCC.
- return returnInt(false);
+ }
}
}
- return returnInt(!Res.isInvalid() && !Res.empty());
+ // Otherwise, any constant value is good enough.
+ return returnInt(true);
}
return returnInt(false);
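Illustrative only (not part of the patch): the semantics the restructured builtin preserves, assuming <typeinfo> is included for the typeid case:

    static_assert(__builtin_constant_p(42), "");            // constant values: true
    static_assert(__builtin_constant_p("foo"), "");         // string literal at offset zero: true
    static_assert(__builtin_constant_p(&typeid(int)), "");  // GCC-compatible: true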
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 8d7fe18..e1f68e4 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -416,7 +416,7 @@ public:
// below them can initialize the same object (or part of it).
if (isa<CXXConstructExpr>(E) || isa<CallExpr>(E) || isa<LambdaExpr>(E) ||
isa<CXXDefaultArgExpr>(E) || isa<CXXStdInitializerListExpr>(E) ||
- isa<AtomicExpr>(E) ||
+ isa<AtomicExpr>(E) || isa<CXXInheritedCtorInitExpr>(E) ||
// We treat `BuiltinBitCastExpr` as an "original initializer" too as
// it may not even be casting from a record type -- and even if it is,
// the two objects are in general of unrelated type.
diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp
index e5adc03..9331a63 100644
--- a/clang/lib/Basic/LangOptions.cpp
+++ b/clang/lib/Basic/LangOptions.cpp
@@ -34,8 +34,8 @@ void LangOptions::resetNonModularOptions() {
// invocations that cannot be round-tripped to arguments.
// FIXME: we should derive this automatically from ImpliedBy in tablegen.
AllowFPReassoc = UnsafeFPMath;
- NoHonorNaNs = FiniteMathOnly;
- NoHonorInfs = FiniteMathOnly;
+ NoHonorInfs = FastMath;
+ NoHonorNaNs = FastMath;
// These options do not affect AST generation.
NoSanitizeFiles.clear();
diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp
index 899aefa..b56e2c7 100644
--- a/clang/lib/Basic/Targets/OSTargets.cpp
+++ b/clang/lib/Basic/Targets/OSTargets.cpp
@@ -173,10 +173,10 @@ static void addVisualCDefines(const LangOptions &Opts, MacroBuilder &Builder) {
// "Under /fp:precise and /fp:strict, the compiler doesn't do any mathematical
// transformation unless the transformation is guaranteed to produce a bitwise
// identical result."
- const bool any_imprecise_flags =
- Opts.FastMath || Opts.FiniteMathOnly || Opts.UnsafeFPMath ||
- Opts.AllowFPReassoc || Opts.NoHonorNaNs || Opts.NoHonorInfs ||
- Opts.NoSignedZero || Opts.AllowRecip || Opts.ApproxFunc;
+ const bool any_imprecise_flags = Opts.FastMath || Opts.UnsafeFPMath ||
+ Opts.AllowFPReassoc || Opts.NoHonorNaNs ||
+ Opts.NoHonorInfs || Opts.NoSignedZero ||
+ Opts.AllowRecip || Opts.ApproxFunc;
// "Under both /fp:precise and /fp:fast, the compiler generates code intended
// to run in the default floating-point environment."
diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp
index fbf942d0..fbe9569 100644
--- a/clang/lib/CodeGen/CGAtomic.cpp
+++ b/clang/lib/CodeGen/CGAtomic.cpp
@@ -727,7 +727,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest,
llvm::Value *LoadVal1 = CGF.Builder.CreateLoad(Val1);
llvm::AtomicRMWInst *RMWI =
- CGF.Builder.CreateAtomicRMW(Op, Ptr, LoadVal1, Order, Scope);
+ CGF.emitAtomicRMWInst(Op, Ptr, LoadVal1, Order, Scope);
RMWI->setVolatile(E->isVolatile());
// For __atomic_*_fetch operations, perform the operation again to
@@ -2034,6 +2034,17 @@ std::pair<RValue, llvm::Value *> CodeGenFunction::EmitAtomicCompareExchange(
IsWeak);
}
+llvm::AtomicRMWInst *
+CodeGenFunction::emitAtomicRMWInst(llvm::AtomicRMWInst::BinOp Op, Address Addr,
+ llvm::Value *Val, llvm::AtomicOrdering Order,
+ llvm::SyncScope::ID SSID) {
+
+ llvm::AtomicRMWInst *RMW =
+ Builder.CreateAtomicRMW(Op, Addr, Val, Order, SSID);
+ getTargetHooks().setTargetAtomicMetadata(*this, *RMW);
+ return RMW;
+}
+
void CodeGenFunction::EmitAtomicUpdate(
LValue LVal, llvm::AtomicOrdering AO,
const llvm::function_ref<RValue(RValue)> &UpdateOp, bool IsVolatile) {
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index a17d684..84392745 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2841,9 +2841,10 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV,
isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub;
llvm::Value *amt = llvm::ConstantFP::get(
VMContext, llvm::APFloat(static_cast<float>(1.0)));
- llvm::Value *old =
- Builder.CreateAtomicRMW(aop, LV.getAddress(), amt,
- llvm::AtomicOrdering::SequentiallyConsistent);
+ llvm::AtomicRMWInst *old =
+ CGF.emitAtomicRMWInst(aop, LV.getAddress(), amt,
+ llvm::AtomicOrdering::SequentiallyConsistent);
+
return isPre ? Builder.CreateBinOp(op, old, amt) : old;
}
value = EmitLoadOfLValue(LV, E->getExprLoc());
@@ -3583,9 +3584,9 @@ LValue ScalarExprEmitter::EmitCompoundAssignLValue(
EmitScalarConversion(OpInfo.RHS, E->getRHS()->getType(), LHSTy,
E->getExprLoc()),
LHSTy);
- Value *OldVal = Builder.CreateAtomicRMW(
- AtomicOp, LHSLV.getAddress(), Amt,
- llvm::AtomicOrdering::SequentiallyConsistent);
+
+ llvm::AtomicRMWInst *OldVal =
+ CGF.emitAtomicRMWInst(AtomicOp, LHSLV.getAddress(), Amt);
// Since operation is atomic, the result type is guaranteed to be the
// same as the input in LLVM terms.
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 853046b..4ee9840 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -6326,8 +6326,8 @@ static std::pair<bool, RValue> emitOMPAtomicRMW(CodeGenFunction &CGF, LValue X,
UpdateVal = CGF.Builder.CreateCast(llvm::Instruction::CastOps::UIToFP, IC,
X.getAddress().getElementType());
}
- llvm::Value *Res =
- CGF.Builder.CreateAtomicRMW(RMWOp, X.getAddress(), UpdateVal, AO);
+ llvm::AtomicRMWInst *Res =
+ CGF.emitAtomicRMWInst(RMWOp, X.getAddress(), UpdateVal, AO);
return std::make_pair(true, RValue::get(Res));
}
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 67e30195..bd62c65 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4160,6 +4160,13 @@ public:
llvm::AtomicOrdering::SequentiallyConsistent,
bool IsWeak = false, AggValueSlot Slot = AggValueSlot::ignored());
+ /// Emit an atomicrmw instruction, applying relevant metadata when
+ /// applicable.
+ llvm::AtomicRMWInst *emitAtomicRMWInst(
+ llvm::AtomicRMWInst::BinOp Op, Address Addr, llvm::Value *Val,
+ llvm::AtomicOrdering Order = llvm::AtomicOrdering::SequentiallyConsistent,
+ llvm::SyncScope::ID SSID = llvm::SyncScope::System);
+
void EmitAtomicUpdate(LValue LVal, llvm::AtomicOrdering AO,
const llvm::function_ref<RValue(RValue)> &UpdateOp,
bool IsVolatile);
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 2f21385..8f17c05 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -334,6 +334,10 @@ public:
llvm::AtomicOrdering Ordering,
llvm::LLVMContext &Ctx) const;
+ /// Allow the target to apply other metadata to an atomic instruction
+ virtual void setTargetAtomicMetadata(CodeGenFunction &CGF,
+ llvm::AtomicRMWInst &RMW) const {}
+
/// Interface class for filling custom fields of a block literal for OpenCL.
class TargetOpenCLBlockHelper {
public:
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 4d3275e..37e6af3 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -311,6 +311,8 @@ public:
SyncScope Scope,
llvm::AtomicOrdering Ordering,
llvm::LLVMContext &Ctx) const override;
+ void setTargetAtomicMetadata(CodeGenFunction &CGF,
+ llvm::AtomicRMWInst &RMW) const override;
llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
llvm::Function *BlockInvokeFunc,
llvm::Type *BlockTy) const override;
@@ -546,6 +548,23 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
return Ctx.getOrInsertSyncScopeID(Name);
}
+void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
+ CodeGenFunction &CGF, llvm::AtomicRMWInst &RMW) const {
+ if (!CGF.getTarget().allowAMDGPUUnsafeFPAtomics())
+ return;
+
+ // TODO: Introduce new, more controlled options that also work for integers,
+ // and deprecate allowAMDGPUUnsafeFPAtomics.
+ llvm::AtomicRMWInst::BinOp RMWOp = RMW.getOperation();
+ if (llvm::AtomicRMWInst::isFPOperation(RMWOp)) {
+ llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
+ RMW.setMetadata("amdgpu.no.fine.grained.memory", Empty);
+
+ if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW.getType()->isFloatTy())
+ RMW.setMetadata("amdgpu.ignore.denormal.mode", Empty);
+ }
+}
+
bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
return false;
}
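For illustration (not part of the patch): when unsafe FP atomics are allowed (e.g. via -munsafe-fp-atomics), a floating-point atomic add such as

    __atomic_fetch_add(ptr, 1.0f, __ATOMIC_SEQ_CST);

is now emitted as an atomicrmw fadd tagged with !amdgpu.no.fine.grained.memory metadata, plus !amdgpu.ignore.denormal.mode when the operand type is a 32-bit float.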
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index f6c7300..f796c31 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -620,8 +620,10 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
const char *LinkingOutput) const {
std::string Linker = getToolChain().GetLinkerPath();
ArgStringList CmdArgs;
- CmdArgs.push_back("--no-undefined");
- CmdArgs.push_back("-shared");
+ if (!Args.hasArg(options::OPT_r)) {
+ CmdArgs.push_back("--no-undefined");
+ CmdArgs.push_back("-shared");
+ }
addLinkerCompressDebugSectionsOption(getToolChain(), Args, CmdArgs);
Args.AddAllArgs(CmdArgs, options::OPT_L);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 4a94df7..6790079 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3384,9 +3384,28 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
CmdArgs.push_back("-ffast-math");
// Handle __FINITE_MATH_ONLY__ similarly.
- if (!HonorINFs && !HonorNaNs)
+ // -ffinite-math-only is added to CmdArgs when !HonorINFs && !HonorNaNs.
+ // Otherwise process the Xclang arguments to determine if -menable-no-infs and
+ // -menable-no-nans are set by the user.
+ bool shouldAddFiniteMathOnly = false;
+ if (!HonorINFs && !HonorNaNs) {
+ shouldAddFiniteMathOnly = true;
+ } else {
+ bool InfValues = true;
+ bool NanValues = true;
+ for (const auto *Arg : Args.filtered(options::OPT_Xclang)) {
+ StringRef ArgValue = Arg->getValue();
+ if (ArgValue == "-menable-no-nans")
+ NanValues = false;
+ else if (ArgValue == "-menable-no-infs")
+ InfValues = false;
+ }
+ if (!NanValues && !InfValues)
+ shouldAddFiniteMathOnly = true;
+ }
+ if (shouldAddFiniteMathOnly) {
CmdArgs.push_back("-ffinite-math-only");
-
+ }
if (const Arg *A = Args.getLastArg(options::OPT_mfpmath_EQ)) {
CmdArgs.push_back("-mfpmath");
CmdArgs.push_back(A->getValue());
@@ -3755,6 +3774,11 @@ static void RenderOpenCLOptions(const ArgList &Args, ArgStringList &CmdArgs,
CmdArgs.push_back(Args.MakeArgString(CLExtStr));
}
+ if (Args.hasArg(options::OPT_cl_finite_math_only)) {
+ CmdArgs.push_back("-menable-no-infs");
+ CmdArgs.push_back("-menable-no-nans");
+ }
+
for (const auto &Arg : ForwardedArguments)
if (const auto *A = Args.getLastArg(Arg))
CmdArgs.push_back(Args.MakeArgString(A->getOption().getPrefixedName()));
diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
index 813a0fb..f883f29 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
+++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
@@ -266,6 +266,10 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
if (UseJMC)
AddLTOFlag("-enable-jmc-instrument");
+ if (Args.hasFlag(options::OPT_fstack_size_section,
+ options::OPT_fno_stack_size_section, false))
+ AddLTOFlag("-stack-size-section");
+
if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir))
AddLTOFlag(Twine("-crash-diagnostics-dir=") + A->getValue());
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 920ddf7..17b9ca7 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1318,7 +1318,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
if (!LangOpts.MathErrno)
Builder.defineMacro("__NO_MATH_ERRNO__");
- if (LangOpts.FastMath || LangOpts.FiniteMathOnly)
+ if (LangOpts.FastMath || (LangOpts.NoHonorInfs && LangOpts.NoHonorNaNs))
Builder.defineMacro("__FINITE_MATH_ONLY__", "1");
else
Builder.defineMacro("__FINITE_MATH_ONLY__", "0");
diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
index 0592423..135dca0 100644
--- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp
+++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
@@ -980,7 +980,6 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok,
*Callbacks->OS << static_cast<unsigned>(*Iter);
PrintComma = true;
}
- IsStartOfLine = true;
} else if (Tok.isAnnotation()) {
// Ignore annotation tokens created by pragmas - the pragmas themselves
// will be reproduced in the preprocessed output.
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 879f01e..1e31fcc 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -1604,6 +1604,16 @@ static bool isTargetVariantEnvironment(const TargetInfo &TI,
return false;
}
+#if defined(__sun__) && defined(__svr4__)
+// GCC mangles std::tm as tm for binary compatibility on Solaris (Issue
+// #33114). We need to match this to allow the std::put_time calls to link
+// (PR #99075).
+asm("_ZNKSt8time_putIcSt19ostreambuf_iteratorIcSt11char_traitsIcEEE3putES3_"
+ "RSt8ios_basecPKSt2tmPKcSB_ = "
+ "_ZNKSt8time_putIcSt19ostreambuf_iteratorIcSt11char_traitsIcEEE3putES3_"
+ "RSt8ios_basecPK2tmPKcSB_");
+#endif
+
/// ExpandBuiltinMacro - If an identifier token is read that is to be expanded
/// as a builtin macro, handle it and return the next token as 'Tok'.
void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 07a34fd..bb30b1e 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1477,6 +1477,18 @@ static bool BuiltinSEHScopeCheck(Sema &SemaRef, CallExpr *TheCall,
return false;
}
+// In OpenCL, __builtin_alloca_* should return a pointer to an address space
+// that corresponds to the stack address space, i.e. the private address space.
+static void builtinAllocaAddrSpace(Sema &S, CallExpr *TheCall) {
+ QualType RT = TheCall->getType();
+ assert((RT->isPointerType() && !(RT->getPointeeType().hasAddressSpace())) &&
+ "__builtin_alloca has invalid address space");
+
+ RT = RT->getPointeeType();
+ RT = S.Context.getAddrSpaceQualType(RT, LangAS::opencl_private);
+ TheCall->setType(S.Context.getPointerType(RT));
+}
+
namespace {
enum PointerAuthOpKind {
PAO_Strip,
@@ -2214,6 +2226,9 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
case Builtin::BI__builtin_alloca_uninitialized:
Diag(TheCall->getBeginLoc(), diag::warn_alloca)
<< TheCall->getDirectCallee();
+ if (getLangOpts().OpenCL) {
+ builtinAllocaAddrSpace(*this, TheCall);
+ }
break;
case Builtin::BI__arithmetic_fence:
if (BuiltinArithmeticFence(TheCall))
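Illustrative OpenCL usage (not part of the patch): after this change a kernel can write

    __private int *p = (__private int *)__builtin_alloca(sizeof(int));

because the builtin's result type now carries the private (stack) address space rather than no address space.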
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 575bd29..7bdecb2 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11009,6 +11009,9 @@ static bool AttrCompatibleWithMultiVersion(attr::Kind Kind,
switch (Kind) {
default:
return false;
+ case attr::ArmLocallyStreaming:
+ return MVKind == MultiVersionKind::TargetVersion ||
+ MVKind == MultiVersionKind::TargetClones;
case attr::Used:
return MVKind == MultiVersionKind::Target;
case attr::NonNull:
@@ -11145,7 +11148,21 @@ bool Sema::areMultiversionVariantFunctionsCompatible(
FunctionType::ExtInfo OldTypeInfo = OldType->getExtInfo();
FunctionType::ExtInfo NewTypeInfo = NewType->getExtInfo();
- if (OldTypeInfo.getCC() != NewTypeInfo.getCC())
+ const auto *OldFPT = OldFD->getType()->getAs<FunctionProtoType>();
+ const auto *NewFPT = NewFD->getType()->getAs<FunctionProtoType>();
+
+ bool ArmStreamingCCMismatched = false;
+ if (OldFPT && NewFPT) {
+ unsigned Diff =
+ OldFPT->getAArch64SMEAttributes() ^ NewFPT->getAArch64SMEAttributes();
+ // Arm-streaming, arm-streaming-compatible and non-streaming versions
+ // cannot be mixed.
+ if (Diff & (FunctionType::SME_PStateSMEnabledMask |
+ FunctionType::SME_PStateSMCompatibleMask))
+ ArmStreamingCCMismatched = true;
+ }
+
+ if (OldTypeInfo.getCC() != NewTypeInfo.getCC() || ArmStreamingCCMismatched)
return Diag(DiffDiagIDAt.first, DiffDiagIDAt.second) << CallingConv;
QualType OldReturnType = OldType->getReturnType();
@@ -11165,9 +11182,8 @@ bool Sema::areMultiversionVariantFunctionsCompatible(
if (!CLinkageMayDiffer && OldFD->isExternC() != NewFD->isExternC())
return Diag(DiffDiagIDAt.first, DiffDiagIDAt.second) << LanguageLinkage;
- if (CheckEquivalentExceptionSpec(
- OldFD->getType()->getAs<FunctionProtoType>(), OldFD->getLocation(),
- NewFD->getType()->getAs<FunctionProtoType>(), NewFD->getLocation()))
+ if (CheckEquivalentExceptionSpec(OldFPT, OldFD->getLocation(), NewFPT,
+ NewFD->getLocation()))
return true;
}
return false;
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 300bd89..8b0a6bf 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3034,9 +3034,6 @@ bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D,
return Diag(LiteralLoc, diag::warn_unsupported_target_attribute)
<< Unsupported << None << CurFeature << TargetVersion;
}
- if (IsArmStreamingFunction(cast<FunctionDecl>(D),
- /*IncludeLocallyStreaming=*/false))
- return Diag(LiteralLoc, diag::err_sme_streaming_cannot_be_multiversioned);
return false;
}
@@ -3133,10 +3130,6 @@ bool Sema::checkTargetClonesAttrString(
HasNotDefault = true;
}
}
- if (IsArmStreamingFunction(cast<FunctionDecl>(D),
- /*IncludeLocallyStreaming=*/false))
- return Diag(LiteralLoc,
- diag::err_sme_streaming_cannot_be_multiversioned);
} else {
// Other targets ( currently X86 )
if (Cur.starts_with("arch=")) {
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 1cca8ac..5782daa 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -10385,7 +10385,7 @@ void Sema::checkIncorrectVTablePointerAuthenticationAttribute(
while (1) {
assert(PrimaryBase);
const CXXRecordDecl *Base = nullptr;
- for (auto BasePtr : PrimaryBase->bases()) {
+ for (const CXXBaseSpecifier &BasePtr : PrimaryBase->bases()) {
if (!BasePtr.getType()->getAsCXXRecordDecl()->isDynamicClass())
continue;
Base = BasePtr.getType()->getAsCXXRecordDecl();
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 51c4a36..029969c 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -6030,6 +6030,32 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, const TypeSourceI
return cast<CXXRecordDecl>(rhsRecord->getDecl())
->isDerivedFrom(cast<CXXRecordDecl>(lhsRecord->getDecl()));
}
+ case BTT_IsVirtualBaseOf: {
+ const RecordType *BaseRecord = LhsT->getAs<RecordType>();
+ const RecordType *DerivedRecord = RhsT->getAs<RecordType>();
+
+ if (!BaseRecord || !DerivedRecord) {
+ DiagnoseVLAInCXXTypeTrait(Self, Lhs,
+ tok::kw___builtin_is_virtual_base_of);
+ DiagnoseVLAInCXXTypeTrait(Self, Rhs,
+ tok::kw___builtin_is_virtual_base_of);
+ return false;
+ }
+
+ if (BaseRecord->isUnionType() || DerivedRecord->isUnionType())
+ return false;
+
+ if (!BaseRecord->isStructureOrClassType() ||
+ !DerivedRecord->isStructureOrClassType())
+ return false;
+
+ if (Self.RequireCompleteType(Rhs->getTypeLoc().getBeginLoc(), RhsT,
+ diag::err_incomplete_type))
+ return false;
+
+ return cast<CXXRecordDecl>(DerivedRecord->getDecl())
+ ->isVirtuallyDerivedFrom(cast<CXXRecordDecl>(BaseRecord->getDecl()));
+ }
case BTT_IsSame:
return Self.Context.hasSameType(LhsT, RhsT);
case BTT_TypeCompatible: {
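From the semantics implemented above: both operands must be structs or classes (unions and non-class types yield false), only the derived operand must be complete, and the trait reports whether the base is inherited virtually anywhere in the hierarchy. A short C++ sketch (type names are illustrative):

  struct A {};
  struct B : virtual A {};
  struct C : B {};   // A remains a virtual base, transitively
  struct D : A {};   // plain, non-virtual inheritance

  static_assert(__builtin_is_virtual_base_of(A, B), "");
  static_assert(__builtin_is_virtual_base_of(A, C), "");
  static_assert(!__builtin_is_virtual_base_of(A, D), "");
  static_assert(!__builtin_is_virtual_base_of(A, A), "");  // not derived from itself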
diff --git a/clang/lib/Sema/SemaPPC.cpp b/clang/lib/Sema/SemaPPC.cpp
index 99f46b1..5b764ed 100644
--- a/clang/lib/Sema/SemaPPC.cpp
+++ b/clang/lib/Sema/SemaPPC.cpp
@@ -93,7 +93,6 @@ bool SemaPPC::CheckPPCBuiltinFunctionCall(const TargetInfo &TI,
unsigned BuiltinID,
CallExpr *TheCall) {
ASTContext &Context = getASTContext();
- unsigned i = 0, l = 0, u = 0;
bool IsTarget64Bit = TI.getTypeWidth(TI.getIntPtrType()) == 64;
llvm::APSInt Result;
@@ -248,7 +247,7 @@ bool SemaPPC::CheckPPCBuiltinFunctionCall(const TargetInfo &TI,
return BuiltinPPCMMACall(TheCall, BuiltinID, Types);
#include "clang/Basic/BuiltinsPPC.def"
}
- return SemaRef.BuiltinConstantArgRange(TheCall, i, l, u);
+ llvm_unreachable("must return from switch");
}
// Check if the given type is a non-pointer PPC MMA type. This function is used
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index b7b857e..db7f233 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -951,9 +951,11 @@ private:
// Skip over the pack elements that were expanded into separate arguments.
// If we partially expanded, this is the number of partial arguments.
+ // FIXME: `&& FixedNumExpansions` is a workaround for UB described in
+ // https://github.com/llvm/llvm-project/issues/100095
if (IsPartiallyExpanded)
PackElements += NumPartialPackArgs;
- else if (IsExpanded)
+ else if (IsExpanded && FixedNumExpansions)
PackElements += *FixedNumExpansions;
for (auto &Pack : Packs) {
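The guard is needed because FixedNumExpansions is a std::optional, and dereferencing a disengaged optional is undefined behavior. A standalone illustration of the pattern (not part of the patch):

  #include <optional>

  unsigned numExpansions(std::optional<unsigned> Fixed) {
    // Writing `return *Fixed;` would be UB when `Fixed` is disengaged;
    // testing engagement first mirrors the workaround above.
    return Fixed ? *Fixed : 0;
  }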
diff --git a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
index cd1dd1b2..4b8e521 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
@@ -21,49 +21,56 @@
#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerHelpers.h"
using namespace clang;
using namespace ento;
namespace {
-class MmapWriteExecChecker : public Checker<check::PreCall> {
+class MmapWriteExecChecker
+ : public Checker<check::ASTDecl<TranslationUnitDecl>, check::PreCall> {
CallDescription MmapFn{CDM::CLibrary, {"mmap"}, 6};
CallDescription MprotectFn{CDM::CLibrary, {"mprotect"}, 3};
- static int ProtWrite;
- static int ProtExec;
- static int ProtRead;
const BugType BT{this, "W^X check fails, Write Exec prot flags set",
"Security"};
+  // Default values are used if the definitions of the flags are not found.
+ mutable int ProtRead = 0x01;
+ mutable int ProtWrite = 0x02;
+ mutable int ProtExec = 0x04;
+
public:
+ void checkASTDecl(const TranslationUnitDecl *TU, AnalysisManager &Mgr,
+ BugReporter &BR) const;
void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
- int ProtExecOv;
- int ProtReadOv;
};
}
-int MmapWriteExecChecker::ProtWrite = 0x02;
-int MmapWriteExecChecker::ProtExec = 0x04;
-int MmapWriteExecChecker::ProtRead = 0x01;
+void MmapWriteExecChecker::checkASTDecl(const TranslationUnitDecl *TU,
+ AnalysisManager &Mgr,
+ BugReporter &BR) const {
+ Preprocessor &PP = Mgr.getPreprocessor();
+ const std::optional<int> FoundProtRead = tryExpandAsInteger("PROT_READ", PP);
+ const std::optional<int> FoundProtWrite =
+ tryExpandAsInteger("PROT_WRITE", PP);
+ const std::optional<int> FoundProtExec = tryExpandAsInteger("PROT_EXEC", PP);
+ if (FoundProtRead && FoundProtWrite && FoundProtExec) {
+ ProtRead = *FoundProtRead;
+ ProtWrite = *FoundProtWrite;
+ ProtExec = *FoundProtExec;
+ }
+}
void MmapWriteExecChecker::checkPreCall(const CallEvent &Call,
- CheckerContext &C) const {
+ CheckerContext &C) const {
if (matchesAny(Call, MmapFn, MprotectFn)) {
SVal ProtVal = Call.getArgSVal(2);
auto ProtLoc = ProtVal.getAs<nonloc::ConcreteInt>();
if (!ProtLoc)
return;
int64_t Prot = ProtLoc->getValue().getSExtValue();
- if (ProtExecOv != ProtExec)
- ProtExec = ProtExecOv;
- if (ProtReadOv != ProtRead)
- ProtRead = ProtReadOv;
-
- // Wrong settings
- if (ProtRead == ProtExec)
- return;
- if ((Prot & (ProtWrite | ProtExec)) == (ProtWrite | ProtExec)) {
+ if ((Prot & ProtWrite) && (Prot & ProtExec)) {
ExplodedNode *N = C.generateNonFatalErrorNode();
if (!N)
return;
@@ -80,17 +87,10 @@ void MmapWriteExecChecker::checkPreCall(const CallEvent &Call,
}
}
-void ento::registerMmapWriteExecChecker(CheckerManager &mgr) {
- MmapWriteExecChecker *Mwec =
- mgr.registerChecker<MmapWriteExecChecker>();
- Mwec->ProtExecOv =
- mgr.getAnalyzerOptions()
- .getCheckerIntegerOption(Mwec, "MmapProtExec");
- Mwec->ProtReadOv =
- mgr.getAnalyzerOptions()
- .getCheckerIntegerOption(Mwec, "MmapProtRead");
+void ento::registerMmapWriteExecChecker(CheckerManager &Mgr) {
+ Mgr.registerChecker<MmapWriteExecChecker>();
}
-bool ento::shouldRegisterMmapWriteExecChecker(const CheckerManager &mgr) {
+bool ento::shouldRegisterMmapWriteExecChecker(const CheckerManager &) {
return true;
}
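Summary of the rewrite: the checker now reads PROT_READ/PROT_WRITE/PROT_EXEC from the translation unit's own macro definitions (falling back to the common 0x1/0x2/0x4 values) instead of taking user-configured overrides, and it warns whenever the write and exec bits are both set. A sketch of code it flags, assuming the usual POSIX constants:

  #include <sys/mman.h>

  void *make_wx(size_t len) {
    // PROT_WRITE and PROT_EXEC are both set, so the W^X warning fires
    // whether or not PROT_READ is also requested.
    return mmap(0, len, PROT_WRITE | PROT_EXEC,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  }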
diff --git a/clang/test/AST/Interp/builtins.cpp b/clang/test/AST/Interp/builtins.cpp
index a74b68b..9b2b207 100644
--- a/clang/test/AST/Interp/builtins.cpp
+++ b/clang/test/AST/Interp/builtins.cpp
@@ -31,3 +31,8 @@ constexpr bool assume() {
return true;
}
static_assert(assume(), "");
+
+void test_builtin_os_log(void *buf, int i, const char *data) {
+ constexpr int len = __builtin_os_log_format_buffer_size("%d %{public}s %{private}.16P", i, data, data);
+ static_assert(len > 0, "Expect len > 0");
+}
diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c
index 2a4c400..b8dbcdd 100644
--- a/clang/test/Analysis/analyzer-config.c
+++ b/clang/test/Analysis/analyzer-config.c
@@ -9,8 +9,6 @@
// CHECK-NEXT: alpha.clone.CloneChecker:ReportNormalClones = true
// CHECK-NEXT: alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling = false
// CHECK-NEXT: alpha.osx.cocoa.DirectIvarAssignment:AnnotatedFunctions = false
-// CHECK-NEXT: alpha.security.MmapWriteExec:MmapProtExec = 0x04
-// CHECK-NEXT: alpha.security.MmapWriteExec:MmapProtRead = 0x01
// CHECK-NEXT: alpha.security.taint.TaintPropagation:Config = ""
// CHECK-NEXT: apply-fixits = false
// CHECK-NEXT: assume-controlled-environment = false
diff --git a/clang/test/Analysis/mmap-writeexec.c b/clang/test/Analysis/mmap-writeexec.c
index 8fd86ce..579cc75 100644
--- a/clang/test/Analysis/mmap-writeexec.c
+++ b/clang/test/Analysis/mmap-writeexec.c
@@ -1,13 +1,14 @@
-// RUN: %clang_analyze_cc1 -triple i686-unknown-linux -analyzer-checker=alpha.security.MmapWriteExec -analyzer-config alpha.security.MmapWriteExec:MmapProtExec=1 -analyzer-config alpha.security.MmapWriteExec:MmapProtRead=4 -DUSE_ALTERNATIVE_PROT_EXEC_DEFINITION -verify %s
+// RUN: %clang_analyze_cc1 -triple i686-unknown-linux -analyzer-checker=alpha.security.MmapWriteExec -DUSE_ALTERNATIVE_PROT_EXEC_DEFINITION -verify %s
// RUN: %clang_analyze_cc1 -triple x86_64-unknown-apple-darwin10 -analyzer-checker=alpha.security.MmapWriteExec -verify %s
-#define PROT_WRITE 0x02
#ifndef USE_ALTERNATIVE_PROT_EXEC_DEFINITION
-#define PROT_EXEC 0x04
-#define PROT_READ 0x01
-#else
#define PROT_EXEC 0x01
+#define PROT_WRITE 0x02
#define PROT_READ 0x04
+#else
+#define PROT_EXEC 0x08
+#define PROT_WRITE 0x04
+#define PROT_READ 0x02
#endif
#define MAP_PRIVATE 0x0002
#define MAP_ANON 0x1000
diff --git a/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c
new file mode 100644
index 0000000..6deff11
--- /dev/null
+++ b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c
@@ -0,0 +1,316 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -fnative-half-arguments-and-returns -triple amdgcn-amd-amdhsa-gnu -target-cpu gfx900 -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,SAFE %s
+// RUN: %clang_cc1 -fnative-half-arguments-and-returns -triple amdgcn-amd-amdhsa-gnu -target-cpu gfx900 -emit-llvm -munsafe-fp-atomics -o - %s | FileCheck -check-prefixes=CHECK,UNSAFE %s
+
+// SAFE-LABEL: define dso_local float @test_float_post_inc(
+// SAFE-SAME: ) #[[ATTR0:[0-9]+]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT: ret float [[TMP0]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0:[0-9]+]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: ret float [[TMP0]]
+//
+float test_float_post_inc()
+{
+ static _Atomic float n;
+ return n++;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT: ret float [[TMP0]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: ret float [[TMP0]]
+//
+float test_float_post_dc()
+{
+ static _Atomic float n;
+ return n--;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: ret float [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: ret float [[TMP1]]
+//
+float test_float_pre_dc()
+{
+ static _Atomic float n;
+ return --n;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: ret float [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: ret float [[TMP1]]
+//
+float test_float_pre_inc()
+{
+ static _Atomic float n;
+ return ++n;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_post_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: ret double [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: ret double [[TMP1]]
+//
+double test_double_post_inc()
+{
+ static _Atomic double n;
+ return n++;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: ret double [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: ret double [[TMP1]]
+//
+double test_double_post_dc()
+{
+ static _Atomic double n;
+ return n--;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: ret double [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: ret double [[TMP2]]
+//
+double test_double_pre_dc()
+{
+ static _Atomic double n;
+ return --n;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT: ret double [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT: ret double [[TMP2]]
+//
+double test_double_pre_inc()
+{
+ static _Atomic double n;
+ return ++n;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_post_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: ret half [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: ret half [[TMP1]]
+//
+_Float16 test__Float16_post_inc()
+{
+ static _Atomic _Float16 n;
+ return n++;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: ret half [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: ret half [[TMP1]]
+//
+_Float16 test__Float16_post_dc()
+{
+ static _Atomic _Float16 n;
+ return n--;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: ret half [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: ret half [[TMP2]]
+//
+_Float16 test__Float16_pre_dc()
+{
+ static _Atomic _Float16 n;
+ return --n;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT: [[ENTRY:.*:]]
+// SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT: ret half [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT: [[ENTRY:.*:]]
+// UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT: ret half [[TMP2]]
+//
+_Float16 test__Float16_pre_inc()
+{
+ static _Atomic _Float16 n;
+ return ++n;
+}
+//.
+// UNSAFE: [[META3]] = !{}
+//.
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}
diff --git a/clang/test/CodeGen/aarch64-fmv-streaming.c b/clang/test/CodeGen/aarch64-fmv-streaming.c
new file mode 100644
index 0000000..e549ccd
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-fmv-streaming.c
@@ -0,0 +1,107 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -emit-llvm -o - %s | FileCheck %s
+
+
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msve
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msimd
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_clones("sve", "simd"))) void n_callee(void) {}
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msme2
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+//
+__attribute__((target_version("sme2"))) void n_callee(void) {}
+// CHECK-LABEL: define {{[^@]+}}@n_callee.default
+// CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void n_callee(void) {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msve
+// CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msimd
+// CHECK-SAME: () #[[ATTR5:[0-9]+]] {
+//
+__attribute__((target_clones("sve", "simd"))) void s_callee(void) __arm_streaming {}
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msme2
+// CHECK-SAME: () #[[ATTR6:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_version("sme2"))) void s_callee(void) __arm_streaming {}
+// CHECK-LABEL: define {{[^@]+}}@s_callee.default
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void s_callee(void) __arm_streaming {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msve
+// CHECK-SAME: () #[[ATTR8:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msimd
+// CHECK-SAME: () #[[ATTR9:[0-9]+]] {
+//
+__attribute__((target_clones("sve", "simd"))) void sc_callee(void) __arm_streaming_compatible {}
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msme2
+// CHECK-SAME: () #[[ATTR10:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_version("sme2"))) void sc_callee(void) __arm_streaming_compatible {}
+// CHECK-LABEL: define {{[^@]+}}@sc_callee.default
+// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void sc_callee(void) __arm_streaming_compatible {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@n_caller
+// CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK: call void @n_callee()
+// CHECK: call void @s_callee() #[[ATTR12:[0-9]+]]
+// CHECK: call void @sc_callee() #[[ATTR13:[0-9]+]]
+//
+void n_caller(void) {
+ n_callee();
+ s_callee();
+ sc_callee();
+}
+
+
+// CHECK-LABEL: define {{[^@]+}}@s_caller
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK: call void @n_callee()
+// CHECK: call void @s_callee() #[[ATTR12]]
+// CHECK: call void @sc_callee() #[[ATTR13]]
+//
+void s_caller(void) __arm_streaming {
+ n_callee();
+ s_callee();
+ sc_callee();
+}
+
+
+// CHECK-LABEL: define {{[^@]+}}@sc_caller
+// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+// CHECK: call void @n_callee()
+// CHECK: call void @s_callee() #[[ATTR12]]
+// CHECK: call void @sc_callee() #[[ATTR13]]
+//
+void sc_caller(void) __arm_streaming_compatible {
+ n_callee();
+ s_callee();
+ sc_callee();
+}
+
+
+// CHECK: attributes #[[ATTR0:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body"
+// CHECK: attributes #[[ATTR1:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body"
+// CHECK: attributes #[[ATTR2:[0-9]+]] = {{.*}}
+// CHECK: attributes #[[ATTR3]] = {{.*}}
+// CHECK: attributes #[[ATTR4:[0-9]+]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR5:[0-9]+]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR6:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body" "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR7]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR8:[0-9]+]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR9:[0-9]+]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR10]] = {{.*}} "aarch64_pstate_sm_body" "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR11]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR12]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR13]] = {{.*}} "aarch64_pstate_sm_compatible"
diff --git a/clang/test/CodeGen/finite-math.c b/clang/test/CodeGen/finite-math.c
index d1a2956..9cddba99d 100644
--- a/clang/test/CodeGen/finite-math.c
+++ b/clang/test/CodeGen/finite-math.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -ffinite-math-only -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=FINITE
+// RUN: %clang_cc1 -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=FINITE
// RUN: %clang_cc1 -fno-signed-zeros -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=NSZ
// RUN: %clang_cc1 -freciprocal-math -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=RECIP
// RUN: %clang_cc1 -mreassociate -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=REASSOC
diff --git a/clang/test/CodeGen/fp-floatcontrol-stack.cpp b/clang/test/CodeGen/fp-floatcontrol-stack.cpp
index 090da25..237c9d4 100644
--- a/clang/test/CodeGen/fp-floatcontrol-stack.cpp
+++ b/clang/test/CodeGen/fp-floatcontrol-stack.cpp
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DDEFAULT=1 -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-DDEFAULT %s
// RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DEBSTRICT=1 -ffp-exception-behavior=strict -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-DEBSTRICT %s
// RUN: %clang_cc1 -triple x86_64-linux-gnu -DFAST=1 -ffast-math -ffp-contract=fast -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-FAST %s
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DNOHONOR=1 -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-NOHONOR %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -ffp-contract=on -DNOHONOR=1 -ffinite-math-only -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-NOHONOR %s
#define FUN(n) \
(float z) { return n * z + n; }
diff --git a/clang/test/CodeGen/fp-options-to-fast-math-flags.c b/clang/test/CodeGen/fp-options-to-fast-math-flags.c
index abdcf85..6aa6226 100644
--- a/clang/test/CodeGen/fp-options-to-fast-math-flags.c
+++ b/clang/test/CodeGen/fp-options-to-fast-math-flags.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck -check-prefix CHECK-PRECISE %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefix CHECK-NO-NANS %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -menable-no-infs -emit-llvm -o - %s | FileCheck -check-prefix CHECK-NO-INFS %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ffinite-math-only -emit-llvm -o - %s | FileCheck -check-prefix CHECK-FINITE %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefix CHECK-FINITE %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fno-signed-zeros -emit-llvm -o - %s | FileCheck -check-prefix CHECK-NO-SIGNED-ZEROS %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -mreassociate -emit-llvm -o - %s | FileCheck -check-prefix CHECK-REASSOC %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -freciprocal-math -emit-llvm -o - %s | FileCheck -check-prefix CHECK-RECIP %s
diff --git a/clang/test/CodeGen/nofpclass.c b/clang/test/CodeGen/nofpclass.c
index fc4c64f..2347091 100644
--- a/clang/test/CodeGen/nofpclass.c
+++ b/clang/test/CodeGen/nofpclass.c
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --version 2
// REQUIRES: x86-registered-target
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -ffinite-math-only -emit-llvm -o - %s | FileCheck -check-prefixes=CFINITEONLY %s
-// RUN: %clang_cc1 -x cl -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -cl-finite-math-only -emit-llvm -o - %s | FileCheck -check-prefixes=CLFINITEONLY %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-infs -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefixes=CFINITEONLY %s
+// RUN: %clang_cc1 -x cl -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-nans -menable-no-infs -emit-llvm -o - %s | FileCheck -check-prefixes=CLFINITEONLY %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-nans -emit-llvm -o - %s | FileCheck -check-prefixes=NONANS %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +avx -fenable-matrix -menable-no-infs -emit-llvm -o - %s | FileCheck -check-prefixes=NOINFS %s
diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
index eeb23bc..55ddb52 100644
--- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
@@ -1,6 +1,10 @@
// RUN: %clang_cc1 -x hip %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa \
// RUN: -fcuda-is-device -target-cpu gfx906 -fnative-half-type \
-// RUN: -fnative-half-arguments-and-returns | FileCheck %s
+// RUN: -fnative-half-arguments-and-returns | FileCheck -check-prefixes=CHECK,SAFEIR %s
+
+// RUN: %clang_cc1 -x hip %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa \
+// RUN: -fcuda-is-device -target-cpu gfx906 -fnative-half-type \
+// RUN: -fnative-half-arguments-and-returns -munsafe-fp-atomics | FileCheck -check-prefixes=CHECK,UNSAFEIR %s
// RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \
// RUN: -fcuda-is-device -target-cpu gfx1100 -fnative-half-type \
@@ -18,24 +22,38 @@
__global__ void ffp1(float *p) {
// CHECK-LABEL: @_Z4ffp1Pf
- // CHECK: atomicrmw fadd ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+ // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4{{$}}
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}}
+
+ // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
// SAFE: _Z4ffp1Pf
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
+ // SAFE: global_atomic_cmpswap
+
// UNSAFE: _Z4ffp1Pf
// UNSAFE: global_atomic_add_f32
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
+ // UNSAFE: global_atomic_cmpswap
+
__atomic_fetch_add(p, 1.0f, memory_order_relaxed);
+ __atomic_fetch_sub(p, 1.0f, memory_order_relaxed);
__atomic_fetch_max(p, 1.0f, memory_order_relaxed);
__atomic_fetch_min(p, 1.0f, memory_order_relaxed);
__hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT);
@@ -44,23 +62,36 @@ __global__ void ffp1(float *p) {
__global__ void ffp2(double *p) {
// CHECK-LABEL: @_Z4ffp2Pd
- // CHECK: atomicrmw fsub ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+ // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}}
+
+ // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
// SAFE-LABEL: @_Z4ffp2Pd
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
+ // SAFE: global_atomic_cmpswap_b64
+
// UNSAFE-LABEL: @_Z4ffp2Pd
+ // UNSAFE: global_atomic_add_f64
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_max_f64
// UNSAFE: global_atomic_min_f64
+ __atomic_fetch_add(p, 1.0, memory_order_relaxed);
__atomic_fetch_sub(p, 1.0, memory_order_relaxed);
__atomic_fetch_max(p, 1.0, memory_order_relaxed);
__atomic_fetch_min(p, 1.0, memory_order_relaxed);
@@ -71,11 +102,20 @@ __global__ void ffp2(double *p) {
// long double is the same as double for amdgcn.
__global__ void ffp3(long double *p) {
// CHECK-LABEL: @_Z4ffp3Pe
- // CHECK: atomicrmw fsub ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+ // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}}
+
+ // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
// SAFE-LABEL: @_Z4ffp3Pe
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
@@ -88,6 +128,7 @@ __global__ void ffp3(long double *p) {
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_max_f64
// UNSAFE: global_atomic_min_f64
+ __atomic_fetch_add(p, 1.0L, memory_order_relaxed);
__atomic_fetch_sub(p, 1.0L, memory_order_relaxed);
__atomic_fetch_max(p, 1.0L, memory_order_relaxed);
__atomic_fetch_min(p, 1.0L, memory_order_relaxed);
@@ -98,37 +139,52 @@ __global__ void ffp3(long double *p) {
__device__ double ffp4(double *p, float f) {
// CHECK-LABEL: @_Z4ffp4Pdf
// CHECK: fpext float {{.*}} to double
- // CHECK: atomicrmw fsub ptr {{.*}} monotonic
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
return __atomic_fetch_sub(p, f, memory_order_relaxed);
}
__device__ double ffp5(double *p, int i) {
// CHECK-LABEL: @_Z4ffp5Pdi
// CHECK: sitofp i32 {{.*}} to double
- // CHECK: atomicrmw fsub ptr {{.*}} monotonic
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
return __atomic_fetch_sub(p, i, memory_order_relaxed);
}
__global__ void ffp6(_Float16 *p) {
// CHECK-LABEL: @_Z4ffp6PDF16
- // CHECK: atomicrmw fadd ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} monotonic
- // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
- // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+ // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2{{$}}
+ // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2{{$}}
+ // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2{{$}}
+ // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2{{$}}
+
+ // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
// SAFE: _Z4ffp6PDF16
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
// SAFE: global_atomic_cmpswap
+ // SAFE: global_atomic_cmpswap
+
// UNSAFE: _Z4ffp6PDF16
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
// UNSAFE: global_atomic_cmpswap
+ // UNSAFE: global_atomic_cmpswap
__atomic_fetch_add(p, 1.0, memory_order_relaxed);
+ __atomic_fetch_sub(p, 1.0, memory_order_relaxed);
__atomic_fetch_max(p, 1.0, memory_order_relaxed);
__atomic_fetch_min(p, 1.0, memory_order_relaxed);
__hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT);
diff --git a/clang/test/CodeGenOpenCL/builtins-alloca.cl b/clang/test/CodeGenOpenCL/builtins-alloca.cl
new file mode 100644
index 0000000..474e95e
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-alloca.cl
@@ -0,0 +1,141 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 \
+// RUN: -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL2.0 \
+// RUN: -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 \
+// RUN: -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 -cl-ext=+__opencl_c_generic_address_space \
+// RUN: -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
+// OPENCL-NEXT: ret void
+//
+void test1_builtin_alloca(unsigned n) {
+ __private float* alloc_ptr = (__private float*)__builtin_alloca(n*sizeof(int));
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
+// OPENCL-NEXT: ret void
+//
+void test1_builtin_alloca_uninitialized(unsigned n) {
+ __private float* alloc_ptr_uninitialized = (__private float*)__builtin_alloca_uninitialized(n*sizeof(int));
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_with_align(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
+// OPENCL-NEXT: ret void
+//
+void test1_builtin_alloca_with_align(unsigned n) {
+ __private float* alloc_ptr_align = (__private float*)__builtin_alloca_with_align((n*sizeof(int)), 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_with_align_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
+// OPENCL-NEXT: ret void
+//
+void test1_builtin_alloca_with_align_uninitialized(unsigned n) {
+ __private float* alloc_ptr_align_uninitialized = (__private float*)__builtin_alloca_with_align_uninitialized((n*sizeof(int)), 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
+// OPENCL-NEXT: ret void
+//
+void test2_builtin_alloca(unsigned n) {
+ __private void *alloc_ptr = __builtin_alloca(n);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
+// OPENCL-NEXT: ret void
+//
+void test2_builtin_alloca_uninitialized(unsigned n) {
+ __private void *alloc_ptr_uninitialized = __builtin_alloca_uninitialized(n);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_with_align(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
+// OPENCL-NEXT: ret void
+//
+void test2_builtin_alloca_with_align(unsigned n) {
+ __private void *alloc_ptr_align = __builtin_alloca_with_align(n, 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_with_align_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT: [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT: store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT: [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
+// OPENCL-NEXT: store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
+// OPENCL-NEXT: ret void
+//
+void test2_builtin_alloca_with_align_uninitialized(unsigned n) {
+ __private void *alloc_ptr_align_uninitialized = __builtin_alloca_with_align_uninitialized(n, 8);
+}
diff --git a/clang/test/CodeGenOpenCL/relaxed-fpmath.cl b/clang/test/CodeGenOpenCL/relaxed-fpmath.cl
index 2751caa..a5f0019 100644
--- a/clang/test/CodeGenOpenCL/relaxed-fpmath.cl
+++ b/clang/test/CodeGenOpenCL/relaxed-fpmath.cl
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s -check-prefix=NORMAL
// RUN: %clang_cc1 %s -emit-llvm -cl-fast-relaxed-math -o - | FileCheck %s -check-prefix=FAST
-// RUN: %clang_cc1 %s -emit-llvm -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
+// RUN: %clang_cc1 %s -emit-llvm -menable-no-infs -menable-no-nans -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
// RUN: %clang_cc1 %s -emit-llvm -cl-unsafe-math-optimizations -o - | FileCheck %s -check-prefix=UNSAFE
// RUN: %clang_cc1 %s -emit-llvm -cl-mad-enable -o - | FileCheck %s -check-prefix=MAD
// RUN: %clang_cc1 %s -emit-llvm -cl-no-signed-zeros -o - | FileCheck %s -check-prefix=NOSIGNED
@@ -9,7 +9,7 @@
// RUN: %clang_cc1 %s -DGEN_PCH=1 -finclude-default-header -triple spir-unknown-unknown -emit-pch -o %t.pch
// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -o - | FileCheck %s -check-prefix=NORMAL
// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-fast-relaxed-math -o - | FileCheck %s -check-prefix=FAST
-// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
+// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -menable-no-infs -menable-no-nans -cl-finite-math-only -o - | FileCheck %s -check-prefix=FINITE
// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-unsafe-math-optimizations -o - | FileCheck %s -check-prefix=UNSAFE
// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-mad-enable -o - | FileCheck %s -check-prefix=MAD
// RUN: %clang_cc1 %s -include-pch %t.pch -fno-validate-pch -emit-llvm -cl-no-signed-zeros -o - | FileCheck %s -check-prefix=NOSIGNED
diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c
index 8ab6a07..ebd1158 100644
--- a/clang/test/Driver/amdgpu-toolchain.c
+++ b/clang/test/Driver/amdgpu-toolchain.c
@@ -28,3 +28,7 @@
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
// RUN: -fuse-ld=ld %s 2>&1 | FileCheck -check-prefixes=LD %s
// LD: ld.lld
+
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
+// RUN: -r %s 2>&1 | FileCheck -check-prefixes=RELO %s
+// RELO-NOT: -shared
diff --git a/clang/test/Driver/opencl.cl b/clang/test/Driver/opencl.cl
index aba37fc..3b0b191 100644
--- a/clang/test/Driver/opencl.cl
+++ b/clang/test/Driver/opencl.cl
@@ -35,7 +35,7 @@
// CHECK-OPT-DISABLE: "-cc1" {{.*}} "-cl-opt-disable"
// CHECK-STRICT-ALIASING: "-cc1" {{.*}} "-cl-strict-aliasing"
// CHECK-SINGLE-PRECISION-CONST: "-cc1" {{.*}} "-cl-single-precision-constant"
-// CHECK-FINITE-MATH-ONLY: "-cc1" {{.*}} "-cl-finite-math-only"
+// CHECK-FINITE-MATH-ONLY: "-cc1" {{.*}} "-menable-no-infs" "-menable-no-nans" "-cl-finite-math-only"
// CHECK-KERNEL-ARG-INFO: "-cc1" {{.*}} "-cl-kernel-arg-info"
// CHECK-UNSAFE-MATH-OPT: "-cc1" {{.*}} "-cl-unsafe-math-optimizations"
// CHECK-FAST-RELAXED-MATH: "-cc1" {{.*}} "-cl-fast-relaxed-math"
diff --git a/clang/test/Driver/stack-size-section.c b/clang/test/Driver/stack-size-section.c
index 71b9f85..7cd41e4 100644
--- a/clang/test/Driver/stack-size-section.c
+++ b/clang/test/Driver/stack-size-section.c
@@ -14,6 +14,7 @@
// RUN: %clang -### --target=x86_64-linux-gnu -flto -fstack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO
// RUN: %clang -### --target=x86_64-linux-gnu -flto -fstack-size-section -fno-stack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO-NO
+// RUN: %clang -### --target=x86_64-sie-ps5 -fstack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO
// LTO: "-plugin-opt=-stack-size-section"
// LTO-NO-NOT: "-plugin-opt=-stack-size-section"
diff --git a/clang/test/Headers/__clang_hip_cmath.hip b/clang/test/Headers/__clang_hip_cmath.hip
index cd085fd..ed1030b 100644
--- a/clang/test/Headers/__clang_hip_cmath.hip
+++ b/clang/test/Headers/__clang_hip_cmath.hip
@@ -13,7 +13,8 @@
// RUN: -internal-isystem %S/../../lib/Headers/cuda_wrappers \
// RUN: -internal-isystem %S/Inputs/include \
// RUN: -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \
-// RUN: -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -ffinite-math-only -o - \
+// RUN: -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -menable-no-infs \
+// RUN: -menable-no-nans -o - \
// RUN: -D__HIPCC_RTC__ | FileCheck -check-prefix=FINITEONLY %s
// DEFAULT-LABEL: @test_fma_f16(
diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip
index 26da828..6ee1097 100644
--- a/clang/test/Headers/__clang_hip_math.hip
+++ b/clang/test/Headers/__clang_hip_math.hip
@@ -14,7 +14,8 @@
// RUN: -internal-isystem %S/../../lib/Headers/cuda_wrappers \
// RUN: -internal-isystem %S/Inputs/include \
// RUN: -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \
-// RUN: -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -ffinite-math-only -o - \
+// RUN: -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -menable-no-infs \
+// RUN: -menable-no-nans -o - \
// RUN: -D__HIPCC_RTC__ | FileCheck -check-prefixes=CHECK,FINITEONLY %s
// Check that we end up with -fapprox-func set on intrinsic calls
diff --git a/clang/test/Headers/float.c b/clang/test/Headers/float.c
index 051c505..a3dd9c9 100644
--- a/clang/test/Headers/float.c
+++ b/clang/test/Headers/float.c
@@ -2,7 +2,7 @@
// RUN: %clang_cc1 -fsyntax-only -verify -std=c99 -ffreestanding %s
// RUN: %clang_cc1 -fsyntax-only -verify -std=c11 -ffreestanding %s
// RUN: %clang_cc1 -fsyntax-only -verify -std=c23 -ffreestanding %s
-// RUN: %clang_cc1 -fsyntax-only -verify=finite -std=c23 -ffreestanding -ffinite-math-only %s
+// RUN: %clang_cc1 -fsyntax-only -verify=finite -std=c23 -ffreestanding -menable-no-nans -menable-no-infs %s
// RUN: %clang_cc1 -fsyntax-only -verify -xc++ -std=c++11 -ffreestanding %s
// RUN: %clang_cc1 -fsyntax-only -verify -xc++ -std=c++14 -ffreestanding %s
// RUN: %clang_cc1 -fsyntax-only -verify -xc++ -std=c++17 -ffreestanding %s
diff --git a/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp b/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp
new file mode 100644
index 0000000..7a34113
--- /dev/null
+++ b/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp
@@ -0,0 +1,59 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck -check-prefix=DEFAULT %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -munsafe-fp-atomics -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck -check-prefix=UNSAFE-FP-ATOMICS %s
+
+#pragma omp declare target
+
+float fv, fx;
+double dv, dx;
+
+// DEFAULT-LABEL: define hidden void @_Z15atomic_fadd_f32v(
+// DEFAULT-SAME: ) #[[ATTR0:[0-9]+]] {
+// DEFAULT-NEXT: [[ENTRY:.*:]]
+// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// DEFAULT-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @fx to ptr), float [[TMP0]] monotonic, align 4
+// DEFAULT-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP0]]
+// DEFAULT-NEXT: store float [[ADD]], ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// DEFAULT-NEXT: ret void
+//
+// UNSAFE-FP-ATOMICS-LABEL: define hidden void @_Z15atomic_fadd_f32v(
+// UNSAFE-FP-ATOMICS-SAME: ) #[[ATTR0:[0-9]+]] {
+// UNSAFE-FP-ATOMICS-NEXT: [[ENTRY:.*:]]
+// UNSAFE-FP-ATOMICS-NEXT: [[TMP0:%.*]] = load float, ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// UNSAFE-FP-ATOMICS-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @fx to ptr), float [[TMP0]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META5:![0-9]+]], !amdgpu.ignore.denormal.mode [[META5]]
+// UNSAFE-FP-ATOMICS-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP0]]
+// UNSAFE-FP-ATOMICS-NEXT: store float [[ADD]], ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// UNSAFE-FP-ATOMICS-NEXT: ret void
+//
+void atomic_fadd_f32() {
+#pragma omp atomic capture
+ fv = fx = fx + fv;
+}
+
+// DEFAULT-LABEL: define hidden void @_Z15atomic_fadd_f64v(
+// DEFAULT-SAME: ) #[[ATTR0]] {
+// DEFAULT-NEXT: [[ENTRY:.*:]]
+// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// DEFAULT-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @dx to ptr), double [[TMP0]] monotonic, align 8
+// DEFAULT-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[TMP0]]
+// DEFAULT-NEXT: store double [[ADD]], ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// DEFAULT-NEXT: ret void
+//
+// UNSAFE-FP-ATOMICS-LABEL: define hidden void @_Z15atomic_fadd_f64v(
+// UNSAFE-FP-ATOMICS-SAME: ) #[[ATTR0]] {
+// UNSAFE-FP-ATOMICS-NEXT: [[ENTRY:.*:]]
+// UNSAFE-FP-ATOMICS-NEXT: [[TMP0:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// UNSAFE-FP-ATOMICS-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @dx to ptr), double [[TMP0]] monotonic, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// UNSAFE-FP-ATOMICS-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[TMP0]]
+// UNSAFE-FP-ATOMICS-NEXT: store double [[ADD]], ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// UNSAFE-FP-ATOMICS-NEXT: ret void
+//
+void atomic_fadd_f64() {
+#pragma omp atomic capture
+ dv = dx = dx + dv;
+}
+
+#pragma omp end declare target
+//.
+// UNSAFE-FP-ATOMICS: [[META5]] = !{}
+//.
diff --git a/clang/test/Parser/namelookup-anonymous-struct.c b/clang/test/Parser/namelookup-anonymous-struct.c
new file mode 100644
index 0000000..cb691c2
--- /dev/null
+++ b/clang/test/Parser/namelookup-anonymous-struct.c
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -std=c11 -verify %s
+
+struct GH31295 {
+ struct { int x; };
+ int arr[sizeof(x)]; // expected-error{{use of undeclared identifier 'x'}}
+};
diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c
index 7f036bf..633ba468 100644
--- a/clang/test/Preprocessor/predefined-macros.c
+++ b/clang/test/Preprocessor/predefined-macros.c
@@ -70,7 +70,7 @@
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-MATH-ERRNO
// CHECK-NO-MATH-ERRNO: #define __NO_MATH_ERRNO__ 1
//
-// RUN: %clang_cc1 %s -E -dM -ffinite-math-only -o - \
+// RUN: %clang_cc1 %s -E -dM -menable-no-nans -menable-no-infs -o - \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FINITE-MATH-ONLY
// CHECK-FINITE-MATH-ONLY: #define __FINITE_MATH_ONLY__ 1
//
@@ -316,4 +316,4 @@
// RUN: -triple amdgcn-amd-amdhsa -fcuda-is-device | FileCheck -match-full-lines \
// RUN: %s --check-prefix=CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG
// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG: #define __HIPSTDPAR__ 1
-// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1 \ No newline at end of file
+// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1
diff --git a/clang/test/Sema/aarch64-fmv-streaming.c b/clang/test/Sema/aarch64-fmv-streaming.c
new file mode 100644
index 0000000..93b7656
--- /dev/null
+++ b/clang/test/Sema/aarch64-fmv-streaming.c
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -Waarch64-sme-attributes -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -Waarch64-sme-attributes -fsyntax-only -verify=expected-cpp -x c++ %s
+
+__attribute__((target_clones("sve", "simd"))) void ok_arm_streaming(void) __arm_streaming {}
+__arm_locally_streaming __attribute__((target_version("sme2"))) void ok_arm_streaming(void) __arm_streaming {}
+__attribute__((target_version("default"))) void ok_arm_streaming(void) __arm_streaming {}
+
+__attribute__((target_clones("sve", "simd"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+__arm_locally_streaming __attribute__((target_version("sme2"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+__attribute__((target_version("default"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+
+__arm_locally_streaming __attribute__((target_clones("sve", "simd"))) void ok_no_streaming(void) {}
+__attribute__((target_version("sme2"))) void ok_no_streaming(void) {}
+__attribute__((target_version("default"))) void ok_no_streaming(void) {}
+
+__attribute__((target_clones("sve", "simd"))) void bad_mixed_streaming(void) {}
+// expected-cpp-error@+2 {{multiversioned function declaration has a different calling convention}}
+// expected-error@+1 {{multiversioned function declaration has a different calling convention}}
+__attribute__((target_version("sme2"))) void bad_mixed_streaming(void) __arm_streaming {}
+// expected-cpp-error@+2 {{multiversioned function declaration has a different calling convention}}
+// expected-error@+1 {{multiversioned function declaration has a different calling convention}}
+__attribute__((target_version("default"))) void bad_mixed_streaming(void) __arm_streaming_compatible {}
+// expected-cpp-error@+2 {{multiversioned function declaration has a different calling convention}}
+// expected-error@+1 {{multiversioned function declaration has a different calling convention}}
+__arm_locally_streaming __attribute__((target_version("dotprod"))) void bad_mixed_streaming(void) __arm_streaming {}
+
+void n_caller(void) {
+ ok_arm_streaming();
+ ok_arm_streaming_compatible();
+ ok_no_streaming();
+ bad_mixed_streaming();
+}
+
+void s_caller(void) __arm_streaming {
+ ok_arm_streaming();
+ ok_arm_streaming_compatible();
+ ok_no_streaming();
+ bad_mixed_streaming();
+}
+
+void sc_caller(void) __arm_streaming_compatible {
+ ok_arm_streaming();
+ ok_arm_streaming_compatible();
+ ok_no_streaming();
+ bad_mixed_streaming();
+}
diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c
index 6db39d6..0c263eb 100644
--- a/clang/test/Sema/aarch64-sme-func-attrs.c
+++ b/clang/test/Sema/aarch64-sme-func-attrs.c
@@ -455,48 +455,6 @@ void unimplemented_spill_fill_za(void (*share_zt0_only)(void) __arm_inout("zt0")
share_zt0_only();
}
-// expected-cpp-error@+2 {{streaming function cannot be multi-versioned}}
-// expected-error@+1 {{streaming function cannot be multi-versioned}}
-__attribute__((target_version("sme2")))
-void cannot_work_version(void) __arm_streaming {}
-// expected-cpp-error@+5 {{function declared 'void ()' was previously declared 'void () __arm_streaming', which has different SME function attributes}}
-// expected-cpp-note@-2 {{previous declaration is here}}
-// expected-error@+3 {{function declared 'void (void)' was previously declared 'void (void) __arm_streaming', which has different SME function attributes}}
-// expected-note@-4 {{previous declaration is here}}
-__attribute__((target_version("default")))
-void cannot_work_version(void) {}
-
-
-// expected-cpp-error@+2 {{streaming function cannot be multi-versioned}}
-// expected-error@+1 {{streaming function cannot be multi-versioned}}
-__attribute__((target_clones("sme2")))
-void cannot_work_clones(void) __arm_streaming {}
-
-
-__attribute__((target("sme2")))
-void just_fine_streaming(void) __arm_streaming {}
-__attribute__((target_version("sme2")))
-void just_fine(void) { just_fine_streaming(); }
-__attribute__((target_version("default")))
-void just_fine(void) {}
-
-
-__arm_locally_streaming
-__attribute__((target_version("sme2")))
-void incompatible_locally_streaming(void) {}
-// expected-error@-1 {{attribute 'target_version' multiversioning cannot be combined with attribute '__arm_locally_streaming'}}
-// expected-cpp-error@-2 {{attribute 'target_version' multiversioning cannot be combined with attribute '__arm_locally_streaming'}}
-__attribute__((target_version("default")))
-void incompatible_locally_streaming(void) {}
-
-
-void fmv_caller() {
- cannot_work_version();
- cannot_work_clones();
- just_fine();
- incompatible_locally_streaming();
-}
-
void sme_streaming_with_vl_arg(__SVInt8_t a) __arm_streaming { }
__SVInt8_t sme_streaming_returns_vl(void) __arm_streaming { __SVInt8_t r; return r; }
diff --git a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
index d18aaad..357c9e5 100644
--- a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
+++ b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
@@ -1,10 +1,11 @@
// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \
-// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \
-// RUN: -menable-no-nans -std=c++23
+// RUN: -triple powerpc64le-unknown-unknown %s \
+// RUN: -menable-no-infs -menable-no-nans -std=c++23
// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \
-// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \
-// RUN: -menable-no-nans -funsafe-math-optimizations -std=c++23
+// RUN: -triple powerpc64le-unknown-unknown %s \
+// RUN: -menable-no-infs -menable-no-nans -funsafe-math-optimizations \
+// RUN: -std=c++23
// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \
// RUN: %s -std=c++23
diff --git a/clang/test/Sema/warn-infinity-nan-disabled-win.cpp b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp
index 51f9d32..ee4eb33 100644
--- a/clang/test/Sema/warn-infinity-nan-disabled-win.cpp
+++ b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp
@@ -6,8 +6,9 @@
// RUN: -menable-no-nans -std=c++23
// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan \
-// RUN: -triple powerpc64le-unknown-unknown %s -menable-no-infs \
-// RUN: -menable-no-nans -funsafe-math-optimizations -std=c++23
+// RUN: -triple powerpc64le-unknown-unknown %s \
+// RUN: -menable-no-infs -menable-no-nans -funsafe-math-optimizations \
+// RUN: -std=c++23
// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown \
// RUN: %s -std=c++23
diff --git a/clang/test/SemaCXX/pr100095.cpp b/clang/test/SemaCXX/pr100095.cpp
new file mode 100644
index 0000000..15913fe
--- /dev/null
+++ b/clang/test/SemaCXX/pr100095.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 %s
+// XFAIL: asserts
+
+template <class> struct Pair;
+template <class...> struct Tuple {
+ template <class _Up> Tuple(_Up);
+};
+template <typename> struct StatusOr;
+template <int> using ElementType = int;
+template <int... fields>
+using Key = Tuple<ElementType<fields>...>;
+template <int... fields>
+StatusOr<Pair<Key<fields...>>> Parser();
+struct Helper { Helper(Tuple<>, Tuple<>, int, int); };
+struct D : Helper {
+ D(Key<> f, int n, int e) : Helper(f, Parser<>, n, e) {}
+};
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index 23b07ca..e131212 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -2402,11 +2402,11 @@ template<typename T> struct DerivedB : BaseA<T> { };
template<typename T> struct CrazyDerived : T { };
-class class_forward; // expected-note 2 {{forward declaration of 'class_forward'}}
+class class_forward; // expected-note 4 {{forward declaration of 'class_forward'}}
template <class T> class DerivedTemp : Base {};
template <class T> class NonderivedTemp {};
-template <class T> class UndefinedTemp; // expected-note {{declared here}}
+template <class T> class UndefinedTemp; // expected-note 2 {{declared here}}
void is_base_of() {
static_assert(__is_base_of(Base, Derived));
@@ -2457,6 +2457,76 @@ void is_base_of() {
static_assert(!__is_base_of(DerivedB<int>, BaseA<int>));
}
+struct DerivedTransitiveViaNonVirtual : Derived3 {};
+struct DerivedTransitiveViaVirtual : virtual Derived3 {};
+
+template <typename T>
+struct CrazyDerivedVirtual : virtual T {};
+
+struct DerivedPrivate : private virtual Base {};
+struct DerivedProtected : protected virtual Base {};
+struct DerivedPrivatePrivate : private DerivedPrivate {};
+struct DerivedPrivateProtected : private DerivedProtected {};
+struct DerivedProtectedPrivate : protected DerivedProtected {};
+struct DerivedProtectedProtected : protected DerivedProtected {};
+
+void is_virtual_base_of(int n) {
+ static_assert(!__builtin_is_virtual_base_of(Base, Derived));
+ static_assert(!__builtin_is_virtual_base_of(const Base, Derived));
+ static_assert(!__builtin_is_virtual_base_of(Derived, Base));
+ static_assert(!__builtin_is_virtual_base_of(Derived, int));
+ static_assert(!__builtin_is_virtual_base_of(Base, Base));
+ static_assert(!__builtin_is_virtual_base_of(Base, Derived3));
+ static_assert(!__builtin_is_virtual_base_of(Derived, Derived3));
+ static_assert(__builtin_is_virtual_base_of(Derived2b, Derived3));
+ static_assert(__builtin_is_virtual_base_of(Derived2a, Derived3));
+ static_assert(!__builtin_is_virtual_base_of(BaseA<int>, DerivedB<int>));
+ static_assert(!__builtin_is_virtual_base_of(DerivedB<int>, BaseA<int>));
+ static_assert(!__builtin_is_virtual_base_of(Union, Union));
+ static_assert(!__builtin_is_virtual_base_of(Empty, Empty));
+ static_assert(!__builtin_is_virtual_base_of(class_forward, class_forward)); // expected-error {{incomplete type 'class_forward' where a complete type is required}}
+ static_assert(!__builtin_is_virtual_base_of(Empty, class_forward)); // expected-error {{incomplete type 'class_forward' where a complete type is required}}
+ static_assert(!__builtin_is_virtual_base_of(class_forward, Empty));
+ static_assert(!__builtin_is_virtual_base_of(Base&, Derived&));
+ static_assert(!__builtin_is_virtual_base_of(Base[10], Derived[10]));
+ static_assert(!__builtin_is_virtual_base_of(Base[n], Derived[n])); // expected-error 2 {{variable length arrays are not supported in '__builtin_is_virtual_base_of'}}
+ static_assert(!__builtin_is_virtual_base_of(int, int));
+ static_assert(!__builtin_is_virtual_base_of(int[], int[]));
+ static_assert(!__builtin_is_virtual_base_of(long, int));
+ static_assert(!__builtin_is_virtual_base_of(Base, DerivedTemp<int>));
+ static_assert(!__builtin_is_virtual_base_of(Base, NonderivedTemp<int>));
+ static_assert(!__builtin_is_virtual_base_of(Base, UndefinedTemp<int>)); // expected-error {{implicit instantiation of undefined template 'UndefinedTemp<int>'}}
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivate));
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedProtected));
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivatePrivate));
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivateProtected));
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedProtectedPrivate));
+ static_assert(__builtin_is_virtual_base_of(Base, DerivedProtectedProtected));
+ static_assert(__builtin_is_virtual_base_of(Derived2a, DerivedTransitiveViaNonVirtual));
+ static_assert(__builtin_is_virtual_base_of(Derived2b, DerivedTransitiveViaNonVirtual));
+ static_assert(__builtin_is_virtual_base_of(Derived2a, DerivedTransitiveViaVirtual));
+ static_assert(__builtin_is_virtual_base_of(Derived2b, DerivedTransitiveViaVirtual));
+ static_assert(!__builtin_is_virtual_base_of(Base, CrazyDerived<Base>));
+ static_assert(!__builtin_is_virtual_base_of(CrazyDerived<Base>, Base));
+ static_assert(__builtin_is_virtual_base_of(Base, CrazyDerivedVirtual<Base>));
+ static_assert(!__builtin_is_virtual_base_of(CrazyDerivedVirtual<Base>, Base));
+
+ static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, IncompleteUnion));
+ static_assert(!__builtin_is_virtual_base_of(Union, IncompleteUnion));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, Union));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteStruct, IncompleteUnion));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, IncompleteStruct));
+ static_assert(!__builtin_is_virtual_base_of(Empty, IncompleteUnion));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, Empty));
+ static_assert(!__builtin_is_virtual_base_of(int, IncompleteUnion));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, int));
+ static_assert(!__builtin_is_virtual_base_of(Empty, Union));
+ static_assert(!__builtin_is_virtual_base_of(Union, Empty));
+ static_assert(!__builtin_is_virtual_base_of(int, Empty));
+ static_assert(!__builtin_is_virtual_base_of(Union, int));
+ static_assert(!__builtin_is_virtual_base_of(IncompleteStruct, IncompleteStruct[n])); // expected-error {{variable length arrays are not supported in '__builtin_is_virtual_base_of'}}
+}
+
template<class T, class U>
class TemplateClass {};
diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index 3885166..30c45ed 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -598,10 +598,11 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
Res.Prevailing = !Sym.isUndefined() && ObjSym.File == *BitcodeFile;
   // We need LTO to preserve the following global symbols:
- // 1) Symbols used in regular objects.
- // 2) Prevailing symbols that are needed visible to the gpu runtime.
+ // 1) All symbols during a relocatable link.
+ // 2) Symbols used in regular objects.
+  // 3) Prevailing symbols that need to be visible to the GPU runtime.
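+  // In the relocatable (-r) case the output is linked again later, so no
+  // symbol may be internalized or dropped by LTO.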
Res.VisibleToRegularObj =
- ObjSym.UsedInRegularObj ||
+ Args.hasArg(OPT_relocatable) || ObjSym.UsedInRegularObj ||
(Res.Prevailing &&
(Sym.getVisibility() != GlobalValue::HiddenVisibility &&
!Sym.canBeOmittedFromSymbolTable()));
diff --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
index 8c80a51..01bd0f8 100644
--- a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
+++ b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
@@ -73,6 +73,10 @@ def plugin_opt : Joined<["--", "-"], "plugin-opt=">, Flags<[WrapperOnlyOption]>,
def save_temps : Flag<["--", "-"], "save-temps">,
Flags<[WrapperOnlyOption]>, HelpText<"Save intermediate results">;
+def relocatable : Flag<["--", "-"], "relocatable">,
+ Flags<[WrapperOnlyOption]>, HelpText<"Perform a relocatable link (LTO only)">;
+def r : Flag<["-"], "r">, Flags<[WrapperOnlyOption]>, Alias<relocatable>;
+
def whole_archive : Flag<["--", "-"], "whole-archive">,
Flags<[WrapperOnlyOption, HelpHidden]>;
def no_whole_archive : Flag<["--", "-"], "no-whole-archive">,
@@ -83,8 +87,7 @@ def mllvm : Separate<["-"], "mllvm">, Flags<[WrapperOnlyOption]>,
HelpText<"Arguments passed to LLVM, including Clang invocations, for which "
"the '-mllvm' prefix is preserved. Use '-mllvm --help' for a list "
"of options.">;
-def mllvm_EQ : Joined<["-"], "mllvm=">, Flags<[HelpHidden]>,
- Alias<mllvm>;
+def mllvm_EQ : Joined<["-"], "mllvm=">, Flags<[HelpHidden]>, Alias<mllvm>;
def dry_run : Flag<["--", "-"], "dry-run">, Flags<[WrapperOnlyOption]>,
HelpText<"Print generated commands without running.">;
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 6d987cc..9b12caa 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -9783,6 +9783,43 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingEmptyAnonymousEnums) {
EXPECT_EQ(ImportedE2, ToE1);
}
+TEST_P(ASTImporterOptionSpecificTestBase, ImportMultipleAnonymousEnumDecls) {
+ Decl *ToTU = getToTuDecl("", Lang_CXX03);
+ Decl *FromTU = getTuDecl(
+ R"(
+ struct foo {
+ enum { A };
+ enum { B };
+ };
+ )",
+ Lang_CXX03);
+
+ auto EnumConstA = enumConstantDecl(hasName("A"));
+ auto EnumConstB = enumConstantDecl(hasName("B"));
+
+ auto *FromA = FirstDeclMatcher<EnumConstantDecl>().match(FromTU, EnumConstA);
+ auto *FromB = FirstDeclMatcher<EnumConstantDecl>().match(FromTU, EnumConstB);
+
+ auto *ToA = Import(FromA, Lang_CXX03);
+ auto *ToB = Import(FromB, Lang_CXX03);
+
+ ASSERT_TRUE(ToA);
+ ASSERT_TRUE(ToB);
+
+ auto *ToFooA = FirstDeclMatcher<CXXRecordDecl>().match(
+ ToTU, tagDecl(has(enumDecl(has(EnumConstA)))));
+ auto *ToFooB = FirstDeclMatcher<CXXRecordDecl>().match(
+ ToTU, tagDecl(has(enumDecl(has(EnumConstB)))));
+ ASSERT_EQ(ToFooA, ToFooB);
+
+  // The constants should have been imported into two distinct EnumDecls.
+ auto *ToEnumDeclA =
+ FirstDeclMatcher<EnumDecl>().match(ToTU, enumDecl(has(EnumConstA)));
+ auto *ToEnumDeclB =
+ FirstDeclMatcher<EnumDecl>().match(ToTU, enumDecl(has(EnumConstB)));
+ ASSERT_NE(ToEnumDeclA, ToEnumDeclB);
+}
+
INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ASTImporterLookupTableTest,
DefaultTestValuesForRunOptions);
diff --git a/clang/unittests/AST/StructuralEquivalenceTest.cpp b/clang/unittests/AST/StructuralEquivalenceTest.cpp
index 952c83b..e994086 100644
--- a/clang/unittests/AST/StructuralEquivalenceTest.cpp
+++ b/clang/unittests/AST/StructuralEquivalenceTest.cpp
@@ -1109,6 +1109,20 @@ TEST_F(StructuralEquivalenceEnumTest, EnumsWithDifferentBody) {
EXPECT_FALSE(testStructuralMatch(t));
}
+TEST_F(StructuralEquivalenceEnumTest, AnonymousEnumsWithSameConsts) {
+ // field x is required to trigger comparison of the anonymous enum
+ auto t = makeNamedDecls("struct foo { enum { A } x; };",
+ "struct foo { enum { A } x;};", Lang_CXX11);
+ EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceEnumTest, AnonymousEnumsWithDiffConsts) {
+ // field x is required to trigger comparison of the anonymous enum
+ auto t = makeNamedDecls("struct foo { enum { A } x; };",
+ "struct foo { enum { B } x;};", Lang_CXX11);
+ EXPECT_FALSE(testStructuralMatch(t));
+}
+
struct StructuralEquivalenceEnumConstantTest : StructuralEquivalenceTest {};
TEST_F(StructuralEquivalenceEnumConstantTest, EnumConstantsWithSameValues) {
diff --git a/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp b/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
index 41fca6b..737277e 100644
--- a/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
@@ -442,6 +442,55 @@ TEST_F(EnvironmentTest, CXXDefaultInitExprResultObjIsWrappedExprResultObj) {
&Env.getResultObjectLocation(*DefaultInit->getExpr()));
}
+// This test verifies the behavior of `getResultObjectLocation()` in
+// scenarios involving inherited constructors.
+// Since the specific AST node of interest, `CXXConstructorDecl`, is implicitly
+// generated, we cannot annotate any statements inside of it as we do in tests
+// within TransferTest. Thus, the only way to get the right `Environment` is by
+// explicitly initializing it as we do in tests within EnvironmentTest.
+// This is why this test is not inside TransferTest, where most of the tests for
+// `getResultObjectLocation()` are located.
+TEST_F(EnvironmentTest, ResultObjectLocationForInheritedCtorInitExpr) {
+ using namespace ast_matchers;
+
+ std::string Code = R"(
+ struct Base {
+ Base(int b) {}
+ };
+ struct Derived : Base {
+ using Base::Base;
+ };
+
+ Derived d = Derived(0);
+ )";
+
+ auto Unit =
+ tooling::buildASTFromCodeWithArgs(Code, {"-fsyntax-only", "-std=c++20"});
+ auto &Context = Unit->getASTContext();
+
+ ASSERT_EQ(Context.getDiagnostics().getClient()->getNumErrors(), 0U);
+
+ auto Results =
+ match(cxxConstructorDecl(
+ hasAnyConstructorInitializer(cxxCtorInitializer(
+ withInitializer(expr().bind("inherited_ctor_init_expr")))))
+ .bind("ctor"),
+ Context);
+ const auto *Constructor = selectFirst<CXXConstructorDecl>("ctor", Results);
+ const auto *InheritedCtorInit = selectFirst<CXXInheritedCtorInitExpr>(
+ "inherited_ctor_init_expr", Results);
+
+ EXPECT_EQ(InheritedCtorInit->child_begin(), InheritedCtorInit->child_end());
+
+ Environment Env(DAContext, *Constructor);
+ Env.initialize();
+
+ RecordStorageLocation &Loc = Env.getResultObjectLocation(*InheritedCtorInit);
+ EXPECT_NE(&Loc, nullptr);
+
+ EXPECT_EQ(&Loc, Env.getThisPointeeStorageLocation());
+}
+
TEST_F(EnvironmentTest, Stmt) {
using namespace ast_matchers;
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 8c892d9..7c7f003 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1072,7 +1072,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
opts.setFPContractMode(fpContractMode);
}
- if (args.getLastArg(clang::driver::options::OPT_menable_no_infinities)) {
+ if (args.getLastArg(clang::driver::options::OPT_menable_no_infs)) {
opts.NoHonorInfs = true;
}
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index 5b3ea21..60b8b32 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -307,6 +307,25 @@ private:
WarnOnIoStmt(source);
}
}
+ template <typename A>
+ void ErrorIfHostSymbol(const A &expr, const parser::CharBlock &source) {
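+    // Arrays referenced inside a CUF kernel must live in device, managed,
+    // or unified memory; a plain host array is not accessible from the
+    // kernel, so diagnose it here.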
+ for (const Symbol &sym : CollectCudaSymbols(expr)) {
+ if (const auto *details =
+ sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
+ if (details->IsArray() &&
+ (!details->cudaDataAttr() ||
+ (details->cudaDataAttr() &&
+ *details->cudaDataAttr() != common::CUDADataAttr::Device &&
+ *details->cudaDataAttr() != common::CUDADataAttr::Managed &&
+ *details->cudaDataAttr() !=
+ common::CUDADataAttr::Unified))) {
+ context_.Say(source,
+ "Host array '%s' cannot be present in CUF kernel"_err_en_US,
+ sym.name());
+ }
+ }
+ }
+ }
void Check(const parser::ActionStmt &stmt, const parser::CharBlock &source) {
common::visit(
common::visitors{
@@ -349,6 +368,19 @@ private:
[&](const common::Indirection<parser::IfStmt> &x) {
Check(x.value());
},
+ [&](const common::Indirection<parser::AssignmentStmt> &x) {
+ if (IsCUFKernelDo) {
+ const evaluate::Assignment *assign{
+ semantics::GetAssignment(x.value())};
+ if (assign) {
+ ErrorIfHostSymbol(assign->lhs, source);
+ ErrorIfHostSymbol(assign->rhs, source);
+ }
+ }
+ if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
+ context_.Say(source, std::move(*msg));
+ }
+ },
[&](const auto &x) {
if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
context_.Say(source, std::move(*msg));
diff --git a/flang/runtime/transformational.cpp b/flang/runtime/transformational.cpp
index cf1e61c..b6b204b 100644
--- a/flang/runtime/transformational.cpp
+++ b/flang/runtime/transformational.cpp
@@ -508,7 +508,8 @@ void RTDEF(CshiftVector)(Descriptor &result, const Descriptor &source,
SubscriptValue lb{sourceDim.LowerBound()};
for (SubscriptValue j{0}; j < extent; ++j) {
SubscriptValue resultAt{1 + j};
- SubscriptValue sourceAt{lb + (j + shift) % extent};
+ SubscriptValue sourceAt{
+ lb + static_cast<SubscriptValue>(j + shift) % extent};
if (sourceAt < lb) {
sourceAt += extent;
}
@@ -619,7 +620,7 @@ void RTDEF(EoshiftVector)(Descriptor &result, const Descriptor &source,
}
SubscriptValue lb{source.GetDimension(0).LowerBound()};
for (SubscriptValue j{1}; j <= extent; ++j) {
- SubscriptValue sourceAt{lb + j - 1 + shift};
+ SubscriptValue sourceAt{lb + j - 1 + static_cast<SubscriptValue>(shift)};
if (sourceAt >= lb && sourceAt < lb + extent) {
CopyElement(result, &j, source, &sourceAt, terminator);
} else if (boundary) {
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index d657f81..7eb74a4 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -133,7 +133,7 @@ subroutine sub4()
integer, parameter :: n = 10
real, device :: adev(n)
real :: ahost(n)
- real :: b
+ real, managed :: b
integer :: i
adev = ahost
diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
index 99cb6eb..ba5d390 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
@@ -7,8 +7,8 @@ subroutine sub1()
integer :: i, j
integer, parameter :: n = 100
integer(8) :: istream
- real :: a(n), b(n)
- real :: c(n,n), d(n,n)
+ real, device :: a(n), b(n)
+ real, device :: c(n,n), d(n,n)
! CHECK-LABEL: func.func @_QPsub1()
! CHECK: %[[IV:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Parser/cuf-sanity-common b/flang/test/Parser/cuf-sanity-common
index 9d73204..9341f05 100644
--- a/flang/test/Parser/cuf-sanity-common
+++ b/flang/test/Parser/cuf-sanity-common
@@ -23,7 +23,8 @@ module m
end subroutine
subroutine test
logical isPinned
- real a(10), x, y, z
+ real, device :: a(10)
+ real :: x, y, z
!$cuf kernel do(1) <<<*, *, stream = 1>>>
do j = 1, 10
end do
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
index d2d4d23..195ddac 100644
--- a/flang/test/Semantics/cuf09.cuf
+++ b/flang/test/Semantics/cuf09.cuf
@@ -18,6 +18,8 @@ module m
end
program main
+ integer, device :: a_d(10 ,10)
+ integer :: b(10, 10)
!$cuf kernel do <<< *, * >>> ! ok
do j = 1, 0
end do
@@ -90,4 +92,12 @@ program main
else if (ifunc() /= 1) then
end if
end do
+
+ !$cuf kernel do (2) <<<*, *>>>
+ do j = 1, 10
+ do i = 1, 10
+ !ERROR: Host array 'b' cannot be present in CUF kernel
+ a_d(i,j) = b(i,j)
+ enddo
+ enddo
end
diff --git a/flang/test/Semantics/reduce.cuf b/flang/test/Semantics/reduce.cuf
index 95ff2e8..92d12ab1 100644
--- a/flang/test/Semantics/reduce.cuf
+++ b/flang/test/Semantics/reduce.cuf
@@ -1,9 +1,9 @@
! RUN: %python %S/test_errors.py %s %flang_fc1
subroutine s(n,m,a,l)
integer, intent(in) :: n
- integer, intent(in) :: m(n)
- real, intent(in) :: a(n)
- logical, intent(in) :: l(n)
+ integer, device, intent(in) :: m(n)
+ real, device, intent(in) :: a(n)
+ logical, device, intent(in) :: l(n)
integer j, mr
real ar
logical lr
diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt
index 3b50666..38eace2 100644
--- a/libc/config/darwin/arm/entrypoints.txt
+++ b/libc/config/darwin/arm/entrypoints.txt
@@ -120,6 +120,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 2334fed..b2c5341 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -343,6 +343,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 61ee68a..8e77105 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -213,6 +213,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 0746680..e3ed5a5 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -365,6 +365,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 035ceb8..96f9755 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -365,6 +365,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index b6aced8..06c3682 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -118,6 +118,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.acoshf
libc.src.math.asinf
libc.src.math.asinhf
+ libc.src.math.atan2
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index e7db07f..9f88b4d 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -260,7 +260,7 @@ Higher Math Functions
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| atan | |check| | | | | | 7.12.4.3 | F.10.1.3 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| atan2 | |check| | | | | | 7.12.4.4 | F.10.1.4 |
+| atan2 | |check| | 1 ULP | | | | 7.12.4.4 | F.10.1.4 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| atan2pi | | | | | | 7.12.4.11 | F.10.1.11 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index d5a5cb6..9c84acc 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -692,6 +692,7 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"atanf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+ FunctionSpec<"atan2", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"atan2f", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
FunctionSpec<"acoshf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 53155c6..27b5b94 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -3849,6 +3849,26 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ atan2
+ SRCS
+ atan2.cpp
+ HDRS
+ ../atan2.h
+ COMPILE_OPTIONS
+ -O3
+ DEPENDS
+ .inv_trigf_utils
+ libc.src.__support.FPUtil.double_double
+ libc.src.__support.FPUtil.dyadic_float
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.nearest_integer
+ libc.src.__support.FPUtil.polyeval
+ libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.macros.optimization
+)
+
+add_entrypoint_object(
scalblnf16
SRCS
scalblnf16.cpp
diff --git a/libc/src/math/generic/atan2.cpp b/libc/src/math/generic/atan2.cpp
new file mode 100644
index 0000000..c39deeb
--- /dev/null
+++ b/libc/src/math/generic/atan2.cpp
@@ -0,0 +1,313 @@
+//===-- Double-precision atan2 function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2.h"
+#include "inv_trigf_utils.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+using DoubleDouble = fputil::DoubleDouble;
+
+// atan(i/64) with i = 0..64, generated by Sollya with:
+// > for i from 0 to 64 do {
+// a = round(atan(i/64), D, RN);
+// b = round(atan(i/64) - a, D, RN);
+// print("{", b, ",", a, "},");
+// };
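+// Each entry is a DoubleDouble {lo, hi} pair: hi is atan(i/64) rounded to
+// nearest, and lo is the rounded remainder.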
+constexpr fputil::DoubleDouble ATAN_I[65] = {
+ {0.0, 0.0},
+ {-0x1.220c39d4dff5p-61, 0x1.fff555bbb729bp-7},
+ {-0x1.5ec431444912cp-60, 0x1.ffd55bba97625p-6},
+ {-0x1.86ef8f794f105p-63, 0x1.7fb818430da2ap-5},
+ {-0x1.c934d86d23f1dp-60, 0x1.ff55bb72cfdeap-5},
+ {0x1.ac4ce285df847p-58, 0x1.3f59f0e7c559dp-4},
+ {-0x1.cfb654c0c3d98p-58, 0x1.7ee182602f10fp-4},
+ {0x1.f7b8f29a05987p-58, 0x1.be39ebe6f07c3p-4},
+ {-0x1.cd37686760c17p-59, 0x1.fd5ba9aac2f6ep-4},
+ {-0x1.b485914dacf8cp-59, 0x1.1e1fafb043727p-3},
+ {0x1.61a3b0ce9281bp-57, 0x1.3d6eee8c6626cp-3},
+ {-0x1.054ab2c010f3dp-58, 0x1.5c9811e3ec26ap-3},
+ {0x1.347b0b4f881cap-58, 0x1.7b97b4bce5b02p-3},
+ {0x1.cf601e7b4348ep-59, 0x1.9a6a8e96c8626p-3},
+ {0x1.17b10d2e0e5abp-61, 0x1.b90d7529260a2p-3},
+ {0x1.c648d1534597ep-57, 0x1.d77d5df205736p-3},
+ {0x1.8ab6e3cf7afbdp-57, 0x1.f5b75f92c80ddp-3},
+ {0x1.62e47390cb865p-56, 0x1.09dc597d86362p-2},
+ {0x1.30ca4748b1bf9p-57, 0x1.18bf5a30bf178p-2},
+ {-0x1.077cdd36dfc81p-56, 0x1.278372057ef46p-2},
+ {-0x1.963a544b672d8p-57, 0x1.362773707ebccp-2},
+ {-0x1.5d5e43c55b3bap-56, 0x1.44aa436c2af0ap-2},
+ {-0x1.2566480884082p-57, 0x1.530ad9951cd4ap-2},
+ {-0x1.a725715711fp-56, 0x1.614840309cfe2p-2},
+ {-0x1.c63aae6f6e918p-56, 0x1.6f61941e4def1p-2},
+ {0x1.69c885c2b249ap-56, 0x1.7d5604b63b3f7p-2},
+ {0x1.b6d0ba3748fa8p-56, 0x1.8b24d394a1b25p-2},
+ {0x1.9e6c988fd0a77p-56, 0x1.98cd5454d6b18p-2},
+ {-0x1.24dec1b50b7ffp-56, 0x1.a64eec3cc23fdp-2},
+ {0x1.ae187b1ca504p-56, 0x1.b3a911da65c6cp-2},
+ {-0x1.cc1ce70934c34p-56, 0x1.c0db4c94ec9fp-2},
+ {-0x1.a2cfa4418f1adp-56, 0x1.cde53432c1351p-2},
+ {0x1.a2b7f222f65e2p-56, 0x1.dac670561bb4fp-2},
+ {0x1.0e53dc1bf3435p-56, 0x1.e77eb7f175a34p-2},
+ {-0x1.a3992dc382a23p-57, 0x1.f40dd0b541418p-2},
+ {-0x1.b32c949c9d593p-55, 0x1.0039c73c1a40cp-1},
+ {-0x1.d5b495f6349e6p-56, 0x1.0657e94db30dp-1},
+ {0x1.974fa13b5404fp-58, 0x1.0c6145b5b43dap-1},
+ {-0x1.2bdaee1c0ee35p-58, 0x1.1255d9bfbd2a9p-1},
+ {0x1.c621cec00c301p-55, 0x1.1835a88be7c13p-1},
+ {-0x1.928df287a668fp-58, 0x1.1e00babdefeb4p-1},
+ {0x1.c421c9f38224ep-57, 0x1.23b71e2cc9e6ap-1},
+ {-0x1.09e73b0c6c087p-56, 0x1.2958e59308e31p-1},
+ {0x1.c5d5e9ff0cf8dp-55, 0x1.2ee628406cbcap-1},
+ {0x1.1021137c71102p-55, 0x1.345f01cce37bbp-1},
+ {-0x1.2304331d8bf46p-55, 0x1.39c391cd4171ap-1},
+ {0x1.ecf8b492644fp-56, 0x1.3f13fb89e96f4p-1},
+ {-0x1.f76d0163f79c8p-56, 0x1.445065b795b56p-1},
+ {0x1.2419a87f2a458p-56, 0x1.4978fa3269ee1p-1},
+ {0x1.4a33dbeb3796cp-55, 0x1.4e8de5bb6ec04p-1},
+ {-0x1.1bb74abda520cp-55, 0x1.538f57b89061fp-1},
+ {-0x1.5e5c9d8c5a95p-56, 0x1.587d81f732fbbp-1},
+ {0x1.0028e4bc5e7cap-57, 0x1.5d58987169b18p-1},
+ {-0x1.2b785350ee8c1p-57, 0x1.6220d115d7b8ep-1},
+ {-0x1.6ea6febe8bbbap-56, 0x1.66d663923e087p-1},
+ {-0x1.a80386188c50ep-55, 0x1.6b798920b3d99p-1},
+ {-0x1.8c34d25aadef6p-56, 0x1.700a7c5784634p-1},
+ {0x1.7b2a6165884a1p-59, 0x1.748978fba8e0fp-1},
+ {0x1.406a08980374p-55, 0x1.78f6bbd5d315ep-1},
+ {0x1.560821e2f3aa9p-55, 0x1.7d528289fa093p-1},
+ {-0x1.bf76229d3b917p-56, 0x1.819d0b7158a4dp-1},
+ {0x1.6b66e7fc8b8c3p-57, 0x1.85d69576cc2c5p-1},
+ {-0x1.55b9a5e177a1bp-55, 0x1.89ff5ff57f1f8p-1},
+ {-0x1.ec182ab042f61p-56, 0x1.8e17aa99cc05ep-1},
+ {0x1.1a62633145c07p-55, 0x1.921fb54442d18p-1},
+};
+
+// Approximate atan(x) for |x| <= 2^-7.
+// Using the degree-9 Taylor polynomial:
+// P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9;
+// Then the absolute error is bounded by:
+// |atan(x) - P(x)| < |x|^11/11 < 2^(-7*11) / 11 < 2^-80.
+// And the relative error is bounded by:
+// |(atan(x) - P(x))/atan(x)| < |x|^10 / 10 < 2^-73.
+// For x = x_hi + x_lo, fully expanding the polynomial and dropping any terms
+// smaller than ulp(x_hi^3 / 3) gives us:
+// P(x) ~ x_hi - x_hi^3/3 + x_hi^5/5 - x_hi^7/7 + x_hi^9/9 +
+// + x_lo * (1 - x_hi^2 + x_hi^4)
+DoubleDouble atan_eval(const DoubleDouble &x) {
+ DoubleDouble p;
+ p.hi = x.hi;
+ double x_hi_sq = x.hi * x.hi;
+ // c0 ~ x_hi^2 * 1/5 - 1/3
+ double c0 = fputil::multiply_add(x_hi_sq, 0x1.999999999999ap-3,
+ -0x1.5555555555555p-2);
+ // c1 ~ x_hi^2 * 1/9 - 1/7
+ double c1 = fputil::multiply_add(x_hi_sq, 0x1.c71c71c71c71cp-4,
+ -0x1.2492492492492p-3);
+ // x_hi^3
+ double x_hi_3 = x_hi_sq * x.hi;
+ // x_hi^4
+ double x_hi_4 = x_hi_sq * x_hi_sq;
+  // d0 ~ -(1/3 - x_hi^2 / 5 + x_hi^4 / 7 - x_hi^6 / 9)
+ double d0 = fputil::multiply_add(x_hi_4, c1, c0);
+ // x_lo - x_lo * x_hi^2 + x_lo * x_hi^4
+ double d1 = fputil::multiply_add(x_hi_4 - x_hi_sq, x.lo, x.lo);
+ // p.lo ~ -x_hi^3/3 + x_hi^5/5 - x_hi^7/7 + x_hi^9/9 +
+ // + x_lo * (1 - x_hi^2 + x_hi^4)
+ p.lo = fputil::multiply_add(x_hi_3, d0, d1);
+ return p;
+}
+
+} // anonymous namespace
+
+// There are several range reduction steps we can take for atan2(y, x), as
+// follows:
+
+// * Range reduction 1: signs
+// atan2(y, x) will return a number between -PI and PI representing the angle
+// formed by the 0x axis and the vector (x, y) in the 0xy-plane.
+// In particular, we have that:
+// atan2(y, x) = atan( y/x ) if x >= 0 and y >= 0 (I-quadrant)
+// = pi + atan( y/x ) if x < 0 and y >= 0 (II-quadrant)
+// = -pi + atan( y/x ) if x < 0 and y < 0 (III-quadrant)
+// = atan( y/x ) if x >= 0 and y < 0 (IV-quadrant)
+// Since the atan function is odd, we can use the formula:
+// atan(-u) = -atan(u)
+// to adjust the above conditions a bit further:
+// atan2(y, x) = atan( |y|/|x| ) if x >= 0 and y >= 0 (I-quadrant)
+// = pi - atan( |y|/|x| ) if x < 0 and y >= 0 (II-quadrant)
+// = -pi + atan( |y|/|x| ) if x < 0 and y < 0 (III-quadrant)
+// = -atan( |y|/|x| ) if x >= 0 and y < 0 (IV-quadrant)
+// Which can be simplified to:
+// atan2(y, x) = sign(y) * atan( |y|/|x| ) if x >= 0
+// = sign(y) * (pi - atan( |y|/|x| )) if x < 0
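+// For instance, the point (x, y) = (-1, 1) lies in the II-quadrant, so
+// atan2(1, -1) = pi - atan(1/1) = pi - pi/4 = 3*pi/4.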
+
+// * Range reduction 2: reciprocal
+// Now that the argument inside atan is positive, we can use the formula:
+// atan(1/x) = pi/2 - atan(x)
+// to make the argument inside atan <= 1 as follows:
+// atan2(y, x) = sign(y) * atan( |y|/|x|) if 0 <= |y| <= x
+// = sign(y) * (pi/2 - atan( |x|/|y| )) if 0 <= x < |y|
+// = sign(y) * (pi - atan( |y|/|x| )) if 0 <= |y| <= -x
+// = sign(y) * (pi/2 + atan( |x|/|y| )) if 0 <= -x < |y|
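+// For instance, with x = 1 and y = 2 we have 0 <= x < |y|, so
+// atan2(2, 1) = pi/2 - atan(1/2), and the argument passed to atan stays
+// within [0, 1].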
+
+// * Range reduction 3: look up table.
+// After the previous two range reduction steps, we reduce the problem to
+// compute atan(u) with 0 <= u <= 1, or to be precise:
+// atan( n / d ) where n = min(|x|, |y|) and d = max(|x|, |y|).
+// An accurate polynomial approximation for the whole [0, 1] input range will
+// require a very large degree. To make it more efficient, we reduce the input
+// range further by finding an integer idx such that:
+// | n/d - idx/64 | <= 1/128.
+// In particular,
+// idx := round(2^6 * n/d)
+// Then for the fast pass, we find a polynomial approximation for:
+// atan( n/d ) ~ atan( idx/64 ) + (n/d - idx/64) * Q(n/d - idx/64)
+// For the accurate pass, we use the addition formula:
+// atan( n/d ) - atan( idx/64 ) = atan( (n/d - idx/64)/(1 + (n*idx)/(64*d)) )
+// = atan( (n - d*(idx/64))/(d + n*(idx/64)) )
+// And for the fast pass, we use a degree-9 Taylor polynomial for the RHS:
+// atan(u) ~ P(u) = u - u^3/3 + u^5/5 - u^7/7 + u^9/9
+// with absolute errors bounded by:
+// |atan(u) - P(u)| < |u|^11 / 11 < 2^-80
+// and relative errors bounded by:
+// |(atan(u) - P(u)) / P(u)| < u^10 / 11 < 2^-73.
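+// As a small worked example of the look-up step: for n/d = 0.3 we get
+// idx = round(2^6 * 0.3) = round(19.2) = 19, so atan(0.3) is evaluated
+// around atan(19/64), with |0.3 - 19/64| = 0.003125 <= 1/128.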
+
+LLVM_LIBC_FUNCTION(double, atan2, (double y, double x)) {
+ using FPBits = fputil::FPBits<double>;
+
+ constexpr double IS_NEG[2] = {1.0, -1.0};
+ constexpr DoubleDouble ZERO = {0.0, 0.0};
+ constexpr DoubleDouble MZERO = {-0.0, -0.0};
+ constexpr DoubleDouble PI = {0x1.1a62633145c07p-53, 0x1.921fb54442d18p+1};
+ constexpr DoubleDouble MPI = {-0x1.1a62633145c07p-53, -0x1.921fb54442d18p+1};
+ constexpr DoubleDouble PI_OVER_2 = {0x1.1a62633145c07p-54,
+ 0x1.921fb54442d18p0};
+ constexpr DoubleDouble MPI_OVER_2 = {-0x1.1a62633145c07p-54,
+ -0x1.921fb54442d18p0};
+ constexpr DoubleDouble PI_OVER_4 = {0x1.1a62633145c07p-55,
+ 0x1.921fb54442d18p-1};
+ constexpr DoubleDouble THREE_PI_OVER_4 = {0x1.a79394c9e8a0ap-54,
+ 0x1.2d97c7f3321d2p+1};
+ // Adjustment for constant term:
+ // CONST_ADJ[x_sign][y_sign][recip]
+ constexpr DoubleDouble CONST_ADJ[2][2][2] = {
+ {{ZERO, MPI_OVER_2}, {MZERO, MPI_OVER_2}},
+ {{MPI, PI_OVER_2}, {MPI, PI_OVER_2}}};
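+  // For example, in the II-quadrant (x < 0, y >= 0, and recip = false when
+  // |y| <= |x|), this table gives CONST_ADJ[1][0][0] = -pi; together with
+  // final_sign = -1 computed below, the result becomes
+  // -(-pi + atan(n/d)) = pi - atan(n/d), as required.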
+
+ FPBits x_bits(x), y_bits(y);
+ bool x_sign = x_bits.sign().is_neg();
+ bool y_sign = y_bits.sign().is_neg();
+ x_bits = x_bits.abs();
+ y_bits = y_bits.abs();
+ uint64_t x_abs = x_bits.uintval();
+ uint64_t y_abs = y_bits.uintval();
+ bool recip = x_abs < y_abs;
+ uint64_t min_abs = recip ? x_abs : y_abs;
+ uint64_t max_abs = !recip ? x_abs : y_abs;
+ unsigned min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+ unsigned max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+
+ double num = FPBits(min_abs).get_val();
+ double den = FPBits(max_abs).get_val();
+
+  // Check for exceptional cases: inputs that are 0, inf, or nan, or that
+  // are close to overflow or underflow.
+ if (LIBC_UNLIKELY(max_exp > 0x7ffU - 128U || min_exp < 128U)) {
+ if (x_bits.is_nan() || y_bits.is_nan())
+ return FPBits::quiet_nan().get_val();
+ unsigned x_except = x_abs == 0 ? 0 : (FPBits(x_abs).is_inf() ? 2 : 1);
+ unsigned y_except = y_abs == 0 ? 0 : (FPBits(y_abs).is_inf() ? 2 : 1);
+
+ // Exceptional cases:
+ // EXCEPT[y_except][x_except][x_is_neg]
+ // with x_except & y_except:
+ // 0: zero
+ // 1: finite, non-zero
+ // 2: infinity
+ constexpr DoubleDouble EXCEPTS[3][3][2] = {
+ {{ZERO, PI}, {ZERO, PI}, {ZERO, PI}},
+ {{PI_OVER_2, PI_OVER_2}, {ZERO, ZERO}, {ZERO, PI}},
+ {{PI_OVER_2, PI_OVER_2},
+ {PI_OVER_2, PI_OVER_2},
+ {PI_OVER_4, THREE_PI_OVER_4}},
+ };
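+    // For example, atan2(+/-0, x) with finite x < 0 selects
+    // EXCEPTS[0][1][1] = pi, which the sign multiplication below turns
+    // into +/-pi, as required by Annex F of the C standard.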
+
+ if ((x_except != 1) || (y_except != 1)) {
+ DoubleDouble r = EXCEPTS[y_except][x_except][x_sign];
+ return fputil::multiply_add(IS_NEG[y_sign], r.hi, IS_NEG[y_sign] * r.lo);
+ }
+ bool scale_up = min_exp < 128U;
+ bool scale_down = max_exp > 0x7ffU - 128U;
+    // At least one input is denormal: multiply both the numerator and the
+    // denominator by a large enough power of 2 to normalize denormal inputs.
+ if (scale_up) {
+ num *= 0x1.0p64;
+ if (!scale_down)
+ den *= 0x1.0p64;
+ } else if (scale_down) {
+ den *= 0x1.0p-64;
+ if (!scale_up)
+ num *= 0x1.0p-64;
+ }
+
+ min_abs = FPBits(num).uintval();
+ max_abs = FPBits(den).uintval();
+ min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+ max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+ }
+
+ double final_sign = IS_NEG[(x_sign != y_sign) != recip];
+ DoubleDouble const_term = CONST_ADJ[x_sign][y_sign][recip];
+ unsigned exp_diff = max_exp - min_exp;
+ // We have the following bound for normalized n and d:
+ // 2^(-exp_diff - 1) < n/d < 2^(-exp_diff + 1).
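+  // When exp_diff > 54, n/d < 2^-53 and atan(n/d) agrees with n/d to far
+  // beyond double precision (the next Taylor term is smaller by a factor of
+  // about (n/d)^2 / 3), so the division alone suffices.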
+ if (LIBC_UNLIKELY(exp_diff > 54)) {
+ return fputil::multiply_add(final_sign, const_term.hi,
+ final_sign * (const_term.lo + num / den));
+ }
+
+ double k = fputil::nearest_integer(64.0 * num / den);
+ unsigned idx = static_cast<unsigned>(k);
+ // k = idx / 64
+ k *= 0x1.0p-6;
+
+ // Range reduction:
+  // atan(n/d) - atan(k) = atan((n/d - k) / (1 + (n/d) * k))
+  //                     = atan((n - d * k) / (d + n * k))
+ DoubleDouble num_k = fputil::exact_mult(num, k);
+ DoubleDouble den_k = fputil::exact_mult(den, k);
+
+ // num_dd = n - d * k
+ DoubleDouble num_dd = fputil::exact_add(num - den_k.hi, -den_k.lo);
+ // den_dd = d + n * k
+ DoubleDouble den_dd = fputil::exact_add(den, num_k.hi);
+ den_dd.lo += num_k.lo;
+
+ // q = (n - d * k) / (d + n * k)
+ DoubleDouble q = fputil::div(num_dd, den_dd);
+ // p ~ atan(q)
+ DoubleDouble p = atan_eval(q);
+
+ DoubleDouble r = fputil::add(const_term, fputil::add(ATAN_I[idx], p));
+ r.hi *= final_sign;
+ r.lo *= final_sign;
+
+ return r.hi + r.lo;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/startup/gpu/CMakeLists.txt b/libc/startup/gpu/CMakeLists.txt
index 6f67fa9..3830bf3 100644
--- a/libc/startup/gpu/CMakeLists.txt
+++ b/libc/startup/gpu/CMakeLists.txt
@@ -26,6 +26,16 @@ function(add_startup_object name)
PROPERTIES
OUTPUT_NAME ${name}.o
)
+
+ # Make an executable target of relocatable bitcode for clang if needed.
+ if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
+ add_executable(${fq_target_name}.exe $<TARGET_OBJECTS:${fq_target_name}>)
+ set_target_properties(${fq_target_name}.exe PROPERTIES
+ RUNTIME_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR}
+ RUNTIME_OUTPUT_NAME ${name}.o)
+ target_link_options(${fq_target_name}.exe PRIVATE
+ "-nostdlib" "-flto" "-Wl,--lto-emit-llvm" "-march= ")
+ endif()
endfunction()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 3ad5d98..380d283 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -2045,6 +2045,18 @@ add_fp_unittest(
)
add_fp_unittest(
+ atan2_test
+ NEED_MPFR
+ SUITE
+ libc-math-unittests
+ SRCS
+ atan2_test.cpp
+ DEPENDS
+ libc.src.math.atan2
+ libc.src.__support.FPUtil.fp_bits
+)
+
+add_fp_unittest(
f16add_test
NEED_MPFR
SUITE
diff --git a/libc/test/src/math/atan2_test.cpp b/libc/test/src/math/atan2_test.cpp
new file mode 100644
index 0000000..637ac89
--- /dev/null
+++ b/libc/test/src/math/atan2_test.cpp
@@ -0,0 +1,125 @@
+//===-- Unittests for atan2 -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/atan2.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcAtan2Test = LIBC_NAMESPACE::testing::FPTest<double>;
+using LIBC_NAMESPACE::testing::tlog;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+TEST_F(LlvmLibcAtan2Test, TrickyInputs) {
+ mpfr::BinaryInput<double> inputs[] = {
+ {0x1.0853408534085p-2, 0x1.e7b54166c6126p-2},
+ {FPBits::inf().get_val(), 0x0.0000000000001p-1022},
+ };
+
+ for (mpfr::BinaryInput<double> &input : inputs) {
+ double x = input.x;
+ double y = input.y;
+ mpfr::RoundingMode rm = mpfr::RoundingMode::Downward;
+ mpfr::ForceRoundingMode rr(rm);
+ ASSERT_MPFR_MATCH(mpfr::Operation::Atan2, input,
+ LIBC_NAMESPACE::atan2(x, y), 0.5, rm);
+ input.x = -input.x;
+ ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, input,
+ LIBC_NAMESPACE::atan2(-x, y), 0.5);
+ input.y = -input.y;
+ ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, input,
+ LIBC_NAMESPACE::atan2(-x, -y), 0.5);
+ input.x = -input.x;
+ ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, input,
+ LIBC_NAMESPACE::atan2(x, -y), 0.5);
+ }
+}
+
+TEST_F(LlvmLibcAtan2Test, InDoubleRange) {
+ constexpr uint64_t X_COUNT = 123;
+ constexpr uint64_t X_START = FPBits(0.25).uintval();
+ constexpr uint64_t X_STOP = FPBits(4.0).uintval();
+ constexpr uint64_t X_STEP = (X_STOP - X_START) / X_COUNT;
+
+ constexpr uint64_t Y_COUNT = 137;
+ constexpr uint64_t Y_START = FPBits(0.25).uintval();
+ constexpr uint64_t Y_STOP = FPBits(4.0).uintval();
+ constexpr uint64_t Y_STEP = (Y_STOP - Y_START) / Y_COUNT;
+
+ auto test = [&](mpfr::RoundingMode rounding_mode) {
+ mpfr::ForceRoundingMode __r(rounding_mode);
+ if (!__r.success)
+ return;
+
+ uint64_t fails = 0;
+ uint64_t finite_count = 0;
+ uint64_t total_count = 0;
+ double failed_x = 0.0, failed_y = 0.0, failed_r = 0.0;
+ double tol = 0.5;
+
+ for (uint64_t i = 0, v = X_START; i <= X_COUNT; ++i, v += X_STEP) {
+ double x = FPBits(v).get_val();
+ if (FPBits(x).is_inf_or_nan() || x < 0.0)
+ continue;
+
+ for (uint64_t j = 0, w = Y_START; j <= Y_COUNT; ++j, w += Y_STEP) {
+ double y = FPBits(w).get_val();
+ if (FPBits(y).is_inf_or_nan())
+ continue;
+
+ double result = LIBC_NAMESPACE::atan2(x, y);
+ ++total_count;
+ if (FPBits(result).is_inf_or_nan())
+ continue;
+
+ ++finite_count;
+ mpfr::BinaryInput<double> inputs{x, y};
+
+ if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Atan2, inputs,
+ result, 0.5, rounding_mode)) {
+ ++fails;
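+          // Double the tolerance until the result matches, so the log below
+          // reports an upper bound on the observed ULP error.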
+ while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(
+ mpfr::Operation::Atan2, inputs, result, tol, rounding_mode)) {
+ failed_x = x;
+ failed_y = y;
+ failed_r = result;
+
+ if (tol > 1000.0)
+ break;
+
+ tol *= 2.0;
+ }
+ }
+ }
+ }
+ if (fails || (finite_count < total_count)) {
+ tlog << " Atan2 failed: " << fails << "/" << finite_count << "/"
+ << total_count << " tests.\n"
+ << " Max ULPs is at most: " << static_cast<uint64_t>(tol) << ".\n";
+ }
+ if (fails) {
+ mpfr::BinaryInput<double> inputs{failed_x, failed_y};
+ EXPECT_MPFR_MATCH(mpfr::Operation::Atan2, inputs, failed_r, 0.5,
+ rounding_mode);
+ }
+ };
+
+ tlog << " Test Rounding To Nearest...\n";
+ test(mpfr::RoundingMode::Nearest);
+
+ tlog << " Test Rounding Downward...\n";
+ test(mpfr::RoundingMode::Downward);
+
+ tlog << " Test Rounding Upward...\n";
+ test(mpfr::RoundingMode::Upward);
+
+ tlog << " Test Rounding Toward Zero...\n";
+ test(mpfr::RoundingMode::TowardZero);
+}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 5ddc88a3..8b29423 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -3532,6 +3532,16 @@ add_fp_unittest(
)
add_fp_unittest(
+ atan2_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ atan2_test.cpp
+ DEPENDS
+ libc.src.math.atan2
+)
+
+add_fp_unittest(
scalblnf16_test
SUITE
libc-math-smoke-tests
diff --git a/libc/test/src/math/smoke/atan2_test.cpp b/libc/test/src/math/smoke/atan2_test.cpp
new file mode 100644
index 0000000..61dd6ca
--- /dev/null
+++ b/libc/test/src/math/smoke/atan2_test.cpp
@@ -0,0 +1,22 @@
+//===-- Unittests for atan2 -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcAtan2Test = LIBC_NAMESPACE::testing::FPTest<double>;
+
+TEST_F(LlvmLibcAtan2Test, SpecialNumbers) {
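+  // Expected per IEEE 754 / C Annex F: NaN operands propagate,
+  // atan2(+/-0, +0) is +/-0, and atan2(+/-x, +inf) is +/-0 for finite x.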
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2(aNaN, zero));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2(1.0, aNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(0.0, LIBC_NAMESPACE::atan2(zero, zero));
+ EXPECT_FP_EQ_ALL_ROUNDING(-0.0, LIBC_NAMESPACE::atan2(-0.0, zero));
+ EXPECT_FP_EQ_ALL_ROUNDING(0.0, LIBC_NAMESPACE::atan2(1.0, inf));
+ EXPECT_FP_EQ_ALL_ROUNDING(-0.0, LIBC_NAMESPACE::atan2(-1.0, inf));
+}
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 917c6be..0840607 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -310,10 +310,6 @@ endif()
option(LIBCXX_ENABLE_PEDANTIC "Compile with pedantic enabled." OFF)
option(LIBCXX_ENABLE_WERROR "Fail and stop if a warning is triggered." OFF)
-option(LIBCXX_GENERATE_COVERAGE "Enable generating code coverage." OFF)
-set(LIBCXX_COVERAGE_LIBRARY "" CACHE STRING
- "The Profile-rt library used to build with code coverage")
-
set(LIBCXX_HERMETIC_STATIC_LIBRARY_DEFAULT OFF)
if (WIN32)
set(LIBCXX_HERMETIC_STATIC_LIBRARY_DEFAULT ON)
@@ -376,12 +372,6 @@ if (NOT LIBCXX_ENABLE_RTTI AND LIBCXX_ENABLE_EXCEPTIONS)
" for details.")
endif()
-# Ensure LLVM_USE_SANITIZER is not specified when LIBCXX_GENERATE_COVERAGE
-# is ON.
-if (LLVM_USE_SANITIZER AND LIBCXX_GENERATE_COVERAGE)
- message(FATAL_ERROR "LLVM_USE_SANITIZER cannot be used with LIBCXX_GENERATE_COVERAGE")
-endif()
-
if (LIBCXX_ENABLE_ABI_LINKER_SCRIPT)
if (APPLE)
message(FATAL_ERROR "LIBCXX_ENABLE_ABI_LINKER_SCRIPT cannot be used on APPLE targets")
@@ -490,12 +480,6 @@ endif()
# Configure compiler.
include(config-ix)
-# Configure coverage options.
-if (LIBCXX_GENERATE_COVERAGE)
- include(CodeCoverage)
- set(CMAKE_BUILD_TYPE "COVERAGE" CACHE STRING "" FORCE)
-endif()
-
#===============================================================================
# Setup Compiler Flags
#===============================================================================
diff --git a/libcxx/cmake/Modules/CodeCoverage.cmake b/libcxx/cmake/Modules/CodeCoverage.cmake
deleted file mode 100644
index 1bd3a78..0000000
--- a/libcxx/cmake/Modules/CodeCoverage.cmake
+++ /dev/null
@@ -1,50 +0,0 @@
-find_program(CODE_COVERAGE_LCOV lcov)
-if (NOT CODE_COVERAGE_LCOV)
- message(FATAL_ERROR "Cannot find lcov...")
-endif()
-
-find_program(CODE_COVERAGE_LLVM_COV llvm-cov)
-if (NOT CODE_COVERAGE_LLVM_COV)
- message(FATAL_ERROR "Cannot find llvm-cov...")
-endif()
-
-find_program(CODE_COVERAGE_GENHTML genhtml)
-if (NOT CODE_COVERAGE_GENHTML)
- message(FATAL_ERROR "Cannot find genhtml...")
-endif()
-
-set(CMAKE_CXX_FLAGS_COVERAGE "-g -O0 --coverage")
-
-function(setup_lcov_test_target_coverage target_name output_dir capture_dirs source_dirs)
- if (NOT DEFINED LIBCXX_BINARY_DIR)
- message(FATAL_ERROR "Variable must be set")
- endif()
-
- set(GCOV_TOOL "${LIBCXX_BINARY_DIR}/llvm-cov-wrapper")
- file(GENERATE OUTPUT ${GCOV_TOOL}
- CONTENT "#!/usr/bin/env bash\n${CODE_COVERAGE_LLVM_COV} gcov \"$@\"\n")
-
- file(MAKE_DIRECTORY ${output_dir})
-
- set(CAPTURE_DIRS "")
- foreach(cdir ${capture_dirs})
- list(APPEND CAPTURE_DIRS "-d;${cdir}")
- endforeach()
-
- set(EXTRACT_DIRS "")
- foreach(sdir ${source_dirs})
- list(APPEND EXTRACT_DIRS "'${sdir}/*'")
- endforeach()
-
- message(STATUS "Capture Directories: ${CAPTURE_DIRS}")
- message(STATUS "Extract Directories: ${EXTRACT_DIRS}")
-
- add_custom_target(generate-lib${target_name}-coverage
- COMMAND chmod +x ${GCOV_TOOL}
- COMMAND ${CODE_COVERAGE_LCOV} --gcov-tool ${GCOV_TOOL} --capture ${CAPTURE_DIRS} -o test_coverage.info
- COMMAND ${CODE_COVERAGE_LCOV} --gcov-tool ${GCOV_TOOL} --extract test_coverage.info ${EXTRACT_DIRS} -o test_coverage.info
- COMMAND ${CODE_COVERAGE_GENHTML} --demangle-cpp test_coverage.info -o test_coverage
- COMMAND ${CMAKE_COMMAND} -E remove test_coverage.info
- WORKING_DIRECTORY ${output_dir}
- COMMENT "Generating coverage results")
-endfunction()
diff --git a/libcxx/cmake/caches/Generic-no-exceptions.cmake b/libcxx/cmake/caches/Generic-no-exceptions.cmake
index f0dffef..c5b2ffd 100644
--- a/libcxx/cmake/caches/Generic-no-exceptions.cmake
+++ b/libcxx/cmake/caches/Generic-no-exceptions.cmake
@@ -1,2 +1,6 @@
set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-experimental.cmake b/libcxx/cmake/caches/Generic-no-experimental.cmake
index f33ed01..f68b265 100644
--- a/libcxx/cmake/caches/Generic-no-experimental.cmake
+++ b/libcxx/cmake/caches/Generic-no-experimental.cmake
@@ -1,2 +1,6 @@
set(LIBCXX_TEST_PARAMS "enable_experimental=False" CACHE STRING "")
set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-filesystem.cmake b/libcxx/cmake/caches/Generic-no-filesystem.cmake
index 4000f3a..57b8d9f 100644
--- a/libcxx/cmake/caches/Generic-no-filesystem.cmake
+++ b/libcxx/cmake/caches/Generic-no-filesystem.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-localization.cmake b/libcxx/cmake/caches/Generic-no-localization.cmake
index 79d6b44..d3150ec 100644
--- a/libcxx/cmake/caches/Generic-no-localization.cmake
+++ b/libcxx/cmake/caches/Generic-no-localization.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-random_device.cmake b/libcxx/cmake/caches/Generic-no-random_device.cmake
index e9b4cc6..8cd1027 100644
--- a/libcxx/cmake/caches/Generic-no-random_device.cmake
+++ b/libcxx/cmake/caches/Generic-no-random_device.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-rtti.cmake b/libcxx/cmake/caches/Generic-no-rtti.cmake
index c62ddce..d080360 100644
--- a/libcxx/cmake/caches/Generic-no-rtti.cmake
+++ b/libcxx/cmake/caches/Generic-no-rtti.cmake
@@ -2,3 +2,7 @@ set(LIBCXX_ENABLE_RTTI OFF CACHE BOOL "")
set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_RTTI OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-threads.cmake b/libcxx/cmake/caches/Generic-no-threads.cmake
index 616baef..81c92fc 100644
--- a/libcxx/cmake/caches/Generic-no-threads.cmake
+++ b/libcxx/cmake/caches/Generic-no-threads.cmake
@@ -1,3 +1,7 @@
set(LIBCXX_ENABLE_THREADS OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_THREADS OFF CACHE BOOL "")
set(LIBCXX_ENABLE_MONOTONIC_CLOCK OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-tzdb.cmake b/libcxx/cmake/caches/Generic-no-tzdb.cmake
index 27c826ed..afe1c8a 100644
--- a/libcxx/cmake/caches/Generic-no-tzdb.cmake
+++ b/libcxx/cmake/caches/Generic-no-tzdb.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_TIME_ZONE_DATABASE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-unicode.cmake b/libcxx/cmake/caches/Generic-no-unicode.cmake
index 01160bf2..27fbc33 100644
--- a/libcxx/cmake/caches/Generic-no-unicode.cmake
+++ b/libcxx/cmake/caches/Generic-no-unicode.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_UNICODE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-wide-characters.cmake b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
index 728d410..72c3045 100644
--- a/libcxx/cmake/caches/Generic-no-wide-characters.cmake
+++ b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_WIDE_CHARACTERS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index 55a0f85..97ecf5e 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -78,7 +78,7 @@
"`2139 <https://wg21.link/LWG2139>`__","What is a user-defined type?","Rapperswil","",""
"`2970 <https://wg21.link/LWG2970>`__","Return type of std::visit misspecified","Rapperswil","|Complete|","11.0"
"`3058 <https://wg21.link/LWG3058>`__","Parallel adjacent_difference shouldn't require creating temporaries","Rapperswil","",""
-"`3062 <https://wg21.link/LWG3062>`__","Unnecessary decay_t in is_execution_policy_v should be remove_cvref_t","Rapperswil","",""
+"`3062 <https://wg21.link/LWG3062>`__","Unnecessary decay_t in is_execution_policy_v should be remove_cvref_t","Rapperswil","|Complete|","17.0"
"`3067 <https://wg21.link/LWG3067>`__","recursive_directory_iterator::pop must invalidate","Rapperswil","|Nothing To Do|",""
"`3071 <https://wg21.link/LWG3071>`__","[networking.ts] read_until still refers to ""input sequence""","Rapperswil","|Nothing To Do|",""
"`3074 <https://wg21.link/LWG3074>`__","Non-member functions for valarray should only deduce from the valarray","Rapperswil","",""
@@ -124,7 +124,7 @@
"`3137 <https://wg21.link/LWG3137>`__","Header for ``__cpp_lib_to_chars``\ ","San Diego","|Complete|",""
"`3140 <https://wg21.link/LWG3140>`__","``COMMON_REF``\ is unimplementable as specified","San Diego","|Nothing To Do|",""
"`3145 <https://wg21.link/LWG3145>`__","``file_clock``\ breaks ABI for C++17 implementations","San Diego","|Complete|",""
-"`3147 <https://wg21.link/LWG3147>`__","Definitions of ""likely"" and ""unlikely"" are likely to cause problems","San Diego","",""
+"`3147 <https://wg21.link/LWG3147>`__","Definitions of ""likely"" and ""unlikely"" are likely to cause problems","San Diego","|Nothing To Do|",""
"`3148 <https://wg21.link/LWG3148>`__","``<concepts>``\ should be freestanding","San Diego","",""
"`3153 <https://wg21.link/LWG3153>`__","``Common``\ and ``common_type``\ have too little in common","San Diego","|Complete|","13.0"
"`3154 <https://wg21.link/LWG3154>`__","``Common``\ and ``CommonReference``\ have a common defect","San Diego","|Nothing To Do|",""
@@ -155,7 +155,7 @@
"`3191 <https://wg21.link/LWG3191>`__","``std::ranges::shuffle``\ synopsis does not match algorithm definition","Cologne","|Complete|","15.0","|ranges|"
"`3196 <https://wg21.link/LWG3196>`__","``std::optional<T>``\ is ill-formed is ``T``\ is an array","Cologne","|Complete|",""
"`3198 <https://wg21.link/LWG3198>`__","Bad constraint on ``std::span::span()``\ ","Cologne","|Complete|",""
-"`3199 <https://wg21.link/LWG3199>`__","``istream >> bitset<0>``\ fails","Cologne","",""
+"`3199 <https://wg21.link/LWG3199>`__","``istream >> bitset<0>``\ fails","Cologne","|Complete|","10.0"
"`3202 <https://wg21.link/LWG3202>`__","P0318R1 was supposed to be revised","Cologne","|Complete|",""
"`3206 <https://wg21.link/LWG3206>`__","``year_month_day``\ conversion to ``sys_days``\ uses not-existing member function","Cologne","|Complete|",""
"`3208 <https://wg21.link/LWG3208>`__","``Boolean``\ 's expression requirements are ordered inconsistently","Cologne","|Nothing To Do|",""
@@ -249,7 +249,7 @@
"`3325 <https://wg21.link/LWG3325>`__","Constrain return type of transformation function for ``transform_view``\ ","Prague","|Complete|","15.0","|ranges|"
"`3326 <https://wg21.link/LWG3326>`__","``enable_view``\ has false positives","Prague","|Complete|","15.0","|ranges|"
"`3327 <https://wg21.link/LWG3327>`__","Format alignment specifiers vs. text direction","Prague","|Nothing To Do|","","|format|"
-"`3328 <https://wg21.link/LWG3328>`__","Clarify that ``std::string``\ is not good for UTF-8","Prague","",""
+"`3328 <https://wg21.link/LWG3328>`__","Clarify that ``std::string``\ is not good for UTF-8","Prague","|Nothing To Do|",""
"`3329 <https://wg21.link/LWG3329>`__","``totally_ordered_with``\ both directly and indirectly requires ``common_reference_with``\ ","Prague","|Complete|","13.0"
"`3330 <https://wg21.link/LWG3330>`__","Include ``<compare>``\ from most library headers","Prague","|Complete|","13.0","|spaceship|"
"`3331 <https://wg21.link/LWG3331>`__","Define ``totally_ordered/_with``\ in terms of ``partially-ordered-with``\ ","Prague","|Complete|","13.0"
@@ -271,7 +271,7 @@
"`3358 <https://wg21.link/LWG3358>`__","|sect|\ [span.cons] is mistaken that ``to_address``\ can throw","Prague","|Complete|","17.0"
"`3359 <https://wg21.link/LWG3359>`__","``<chrono>``\ leap second support should allow for negative leap seconds","Prague","|In Progress|","","|chrono|"
"`3360 <https://wg21.link/LWG3360>`__","``three_way_comparable_with``\ is inconsistent with similar concepts","Prague","|Nothing To Do|","","|spaceship|"
-"`3362 <https://wg21.link/LWG3362>`__","Strike ``stop_source``\ 's ``operator!=``\ ","Prague","",""
+"`3362 <https://wg21.link/LWG3362>`__","Strike ``stop_source``\ 's ``operator!=``\ ","Prague","|Complete|","17.0"
"`3363 <https://wg21.link/LWG3363>`__","``drop_while_view``\ should opt-out of ``sized_range``\ ","Prague","|Nothing To Do|","","|ranges|"
"`3364 <https://wg21.link/LWG3364>`__","Initialize data members of ranges and their iterators","Prague","|Complete|","16.0","|ranges|"
"`3367 <https://wg21.link/LWG3367>`__","Integer-class conversions should not throw","Prague","|Nothing To Do|",""
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index 0e466086..e1a5aa6 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -145,7 +145,7 @@
"`3607 <https://wg21.link/LWG3607>`__","``contiguous_iterator`` should not be allowed to have custom ``iter_move`` and ``iter_swap`` behavior","February 2022","|Nothing to do|","","|ranges|"
"`3610 <https://wg21.link/LWG3610>`__","``iota_view::size`` sometimes rejects integer-class types","February 2022","","","|ranges|"
"`3612 <https://wg21.link/LWG3612>`__","Inconsistent pointer alignment in ``std::format`` ","February 2022","|Complete|","14.0","|format|"
-"`3616 <https://wg21.link/LWG3616>`__","LWG 3498 seems to miss the non-member ``swap`` for ``basic_syncbuf`` ","February 2022","",""
+"`3616 <https://wg21.link/LWG3616>`__","LWG 3498 seems to miss the non-member ``swap`` for ``basic_syncbuf`` ","February 2022","|Complete|","18.0"
"`3618 <https://wg21.link/LWG3618>`__","Unnecessary ``iter_move`` for ``transform_view::iterator`` ","February 2022","|Complete|","19.0","|ranges|"
"`3619 <https://wg21.link/LWG3619>`__","Specification of ``vformat_to`` contains ill-formed ``formatted_size`` calls","February 2022","|Nothing to do|","","|format|"
"`3621 <https://wg21.link/LWG3621>`__","Remove feature-test macro ``__cpp_lib_monadic_optional`` ","February 2022","|Complete|","15.0"
@@ -180,7 +180,7 @@
"`3710 <https://wg21.link/LWG3710>`__","The ``end`` of ``chunk_view`` for input ranges can be ``const``","July 2022","","","|ranges|"
"`3711 <https://wg21.link/LWG3711>`__","Missing preconditions for slide_view constructor","July 2022","","","|ranges|"
"`3712 <https://wg21.link/LWG3712>`__","``chunk_view`` and ``slide_view`` should not be ``default_initializable``","July 2022","","","|ranges|"
-"`3713 <https://wg21.link/LWG3713>`__","Sorted with respect to comparator (only)","July 2022","",""
+"`3713 <https://wg21.link/LWG3713>`__","Sorted with respect to comparator (only)","July 2022","|Nothing To Do|",""
"`3715 <https://wg21.link/LWG3715>`__","``view_interface::empty`` is overconstrained","July 2022","|Complete|","19.0","|ranges|"
"`3719 <https://wg21.link/LWG3719>`__","Directory iterators should be usable with default sentinel","July 2022","|Complete|","17.0","|ranges|"
"`3721 <https://wg21.link/LWG3721>`__","Allow an ``arg-id`` with a value of zero for ``width`` in ``std-format-spec``","July 2022","|Complete|","16.0","|format|"
@@ -228,7 +228,7 @@
"`3778 <https://wg21.link/LWG3778>`__","``vector<bool>`` missing exception specifications", "November 2022","|Complete|","3.7",""
"`3781 <https://wg21.link/LWG3781>`__","The exposition-only alias templates ``cont-key-type`` and ``cont-mapped-type`` should be removed", "November 2022","|Nothing to do|","",""
"`3782 <https://wg21.link/LWG3782>`__","Should ``<math.h>`` declare ``::lerp``?", "November 2022","|Complete|","17.0",""
-"`3784 <https://wg21.link/LWG3784>`__","std.compat should not provide ``::byte`` and its friends", "November 2022","","",""
+"`3784 <https://wg21.link/LWG3784>`__","std.compat should not provide ``::byte`` and its friends", "November 2022","|Complete|","19.0",""
"`3785 <https://wg21.link/LWG3785>`__","``ranges::to`` is over-constrained on the destination type being a range", "November 2022","","","|ranges|"
"`3788 <https://wg21.link/LWG3788>`__","``jthread::operator=(jthread&&)`` postconditions are unimplementable under self-assignment", "November 2022","","",""
"`3792 <https://wg21.link/LWG3792>`__","``__cpp_lib_constexpr_algorithms`` should also be defined in ``<utility>``", "November 2022","|Complete|","16.0",""
@@ -241,9 +241,9 @@
"`3817 <https://wg21.link/LWG3817>`__","Missing preconditions on ``forward_list`` modifiers", "November 2022","","",""
"`3818 <https://wg21.link/LWG3818>`__","Exposition-only concepts are not described in library intro", "November 2022","|Nothing to do|","",""
"`3822 <https://wg21.link/LWG3822>`__","Avoiding normalization in ``filesystem::weakly_canonical``", "November 2022","","",""
-"`3823 <https://wg21.link/LWG3823>`__","Unnecessary precondition for ``is_aggregate``", "November 2022","","",""
+"`3823 <https://wg21.link/LWG3823>`__","Unnecessary precondition for ``is_aggregate``", "November 2022","|Nothing To Do|","",""
"`3824 <https://wg21.link/LWG3824>`__","Number of ``bind`` placeholders is underspecified", "November 2022","|Nothing to do|","",""
-"`3826 <https://wg21.link/LWG3826>`__","Redundant specification [for overload of yield_value]", "November 2022","","",""
+"`3826 <https://wg21.link/LWG3826>`__","Redundant specification [for overload of yield_value]", "November 2022","|Nothing To Do|","",""
"","","","","",""
"`2195 <https://wg21.link/LWG2195>`__","Missing constructors for ``match_results``","February 2023","","",""
"`2295 <https://wg21.link/LWG2295>`__","Locale name when the provided ``Facet`` is a ``nullptr``","February 2023","","",""
@@ -288,7 +288,7 @@
"`3803 <https://wg21.link/LWG3803>`__","``flat_foo`` constructors taking ``KeyContainer`` lack ``KeyCompare`` parameter","February 2023","","",""
"`3810 <https://wg21.link/LWG3810>`__","CTAD for ``std::basic_format_args``","February 2023","|Complete|","17.0","|format|"
"`3827 <https://wg21.link/LWG3827>`__","Deprecate ``<stdalign.h>`` and ``<stdbool.h>`` macros","February 2023","","",""
-"`3828 <https://wg21.link/LWG3828>`__","Sync ``intmax_t`` and ``uintmax_t`` with C2x","February 2023","","",""
+"`3828 <https://wg21.link/LWG3828>`__","Sync ``intmax_t`` and ``uintmax_t`` with C2x","February 2023","|Nothing To Do|","",""
"`3833 <https://wg21.link/LWG3833>`__","Remove specialization ``template<size_t N> struct formatter<const charT[N], charT>``","February 2023","|Complete|","17.0","|format|"
"`3836 <https://wg21.link/LWG3836>`__","``std::expected<bool, E1>`` conversion constructor ``expected(const expected<U, G>&)`` should take precedence over ``expected(U&&)`` with operator ``bool``","February 2023","|Complete|","18.0",""
"`3843 <https://wg21.link/LWG3843>`__","``std::expected<T,E>::value() &`` assumes ``E`` is copy constructible","February 2023","|Complete|","17.0",""
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index dec9af1..29eb163 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -18,7 +18,7 @@
"`3940 <https://wg21.link/LWG3940>`__","``std::expected<void, E>::value()`` also needs ``E`` to be copy constructible","Varna June 2023","|Complete|","18.0",""
"","","","","",""
"`2392 <https://wg21.link/LWG2392>`__","""character type"" is used but not defined","Kona November 2023","","",""
-"`3203 <https://wg21.link/LWG3203>`__","``span`` element access invalidation","Kona November 2023","","",""
+"`3203 <https://wg21.link/LWG3203>`__","``span`` element access invalidation","Kona November 2023","|Nothing To Do|","",""
"`3305 <https://wg21.link/LWG3305>`__","``any_cast<void>``","Kona November 2023","|Complete|","18.0",""
"`3431 <https://wg21.link/LWG3431>`__","``<=>`` for containers should require ``three_way_comparable<T>`` instead of ``<=>``","Kona November 2023","","",""
"`3749 <https://wg21.link/LWG3749>`__","``common_iterator`` should handle integer-class difference types","Kona November 2023","","",""
diff --git a/libcxx/include/__thread/thread.h b/libcxx/include/__thread/thread.h
index d2254a6..458c1cd 100644
--- a/libcxx/include/__thread/thread.h
+++ b/libcxx/include/__thread/thread.h
@@ -10,6 +10,7 @@
#ifndef _LIBCPP___THREAD_THREAD_H
#define _LIBCPP___THREAD_THREAD_H
+#include <__assert>
#include <__condition_variable/condition_variable.h>
#include <__config>
#include <__exception/terminate.h>
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index bfc88c4..fe9d2666 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -147,11 +147,6 @@ if(NOT LIBCXX_INSTALL_LIBRARY)
set(exclude_from_all EXCLUDE_FROM_ALL)
endif()
-if (LIBCXX_GENERATE_COVERAGE AND NOT LIBCXX_COVERAGE_LIBRARY)
- find_compiler_rt_library(profile LIBCXX_COVERAGE_LIBRARY)
-endif()
-add_library_flags_if(LIBCXX_COVERAGE_LIBRARY "${LIBCXX_COVERAGE_LIBRARY}")
-
if (APPLE AND LLVM_USE_SANITIZER)
if (("${LLVM_USE_SANITIZER}" STREQUAL "Address") OR
("${LLVM_USE_SANITIZER}" STREQUAL "Address;Undefined") OR
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index 001b29e..7440ffa 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -1,12 +1,6 @@
include(HandleLitArguments)
add_subdirectory(tools)
-# By default, libcxx and libcxxabi share a library directory.
-if (NOT LIBCXX_CXX_ABI_LIBRARY_PATH)
- set(LIBCXX_CXX_ABI_LIBRARY_PATH "${LIBCXX_LIBRARY_DIR}" CACHE PATH
- "The path to libc++abi library.")
-endif()
-
set(AUTO_GEN_COMMENT "## Autogenerated by libcxx configuration.\n# Do not edit!")
set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
@@ -49,15 +43,3 @@ add_lit_testsuite(check-cxx
"Running libcxx tests"
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS cxx-test-depends)
-
-if (LIBCXX_GENERATE_COVERAGE)
- include(CodeCoverage)
- set(output_dir "${CMAKE_CURRENT_BINARY_DIR}/coverage")
- set(capture_dirs
- "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx_objects.dir/"
- "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx.dir/"
- "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx_experimental.dir/"
- "${CMAKE_CURRENT_BINARY_DIR}")
- set(extract_dirs "${LIBCXX_SOURCE_DIR}/include;${LIBCXX_SOURCE_DIR}/src")
- setup_lcov_test_target_coverage("cxx" "${output_dir}" "${capture_dirs}" "${extract_dirs}")
-endif()
diff --git a/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
index 86e0cba..4fc453f 100644
--- a/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
@@ -12,6 +12,7 @@
#include <atomic>
#include <cassert>
#include <concepts>
+#include <cstddef>
template <typename T>
constexpr void check_required_alignment() {
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 9466e8b..db0bc6c 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -511,6 +511,12 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
return R_TLSDESC;
case R_LARCH_TLS_DESC_CALL:
return R_TLSDESC_CALL;
+ case R_LARCH_TLS_LD_PCREL20_S2:
+ return R_TLSLD_PC;
+ case R_LARCH_TLS_GD_PCREL20_S2:
+ return R_TLSGD_PC;
+ case R_LARCH_TLS_DESC_PCREL20_S2:
+ return R_TLSDESC_PC;
// Other known relocs that are explicitly unimplemented:
//
@@ -557,7 +563,11 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
write64le(loc, val);
return;
+ // Relocs intended for `pcaddi`.
case R_LARCH_PCREL20_S2:
+ case R_LARCH_TLS_LD_PCREL20_S2:
+ case R_LARCH_TLS_GD_PCREL20_S2:
+ case R_LARCH_TLS_DESC_PCREL20_S2:
checkInt(loc, val, 22, rel);
checkAlignment(loc, val, 4, rel);
write32le(loc, setJ20(read32le(loc), val >> 2));
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 36857d7..6ad5c3b 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1308,7 +1308,8 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym,
// LoongArch does not yet implement transition from TLSDESC to LE/IE, so
// generate TLSDESC dynamic relocation for the dynamic linker to handle.
if (config->emachine == EM_LOONGARCH &&
- oneof<R_LOONGARCH_TLSDESC_PAGE_PC, R_TLSDESC, R_TLSDESC_CALL>(expr)) {
+ oneof<R_LOONGARCH_TLSDESC_PAGE_PC, R_TLSDESC, R_TLSDESC_PC,
+ R_TLSDESC_CALL>(expr)) {
if (expr != R_TLSDESC_CALL) {
sym.setFlags(NEEDS_TLSDESC);
c.addReloc({expr, type, offset, addend, &sym});
diff --git a/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s b/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s
new file mode 100644
index 0000000..d4d12b9
--- /dev/null
+++ b/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s
@@ -0,0 +1,129 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/a.s -o %t/a.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/bc.s -o %t/bc.32.o
+# RUN: ld.lld -shared -soname=bc.so %t/bc.32.o -o %t/bc.32.so
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/tga.s -o %t/tga.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/a.s -o %t/a.64.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/bc.s -o %t/bc.64.o
+# RUN: ld.lld -shared -soname=bc.so %t/bc.64.o -o %t/bc.64.so
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/tga.s -o %t/tga.64.o
+
+## LA32 GD
+# RUN: ld.lld -shared %t/a.32.o %t/bc.32.o -o %t/gd.32.so
+# RUN: llvm-readobj -r %t/gd.32.so | FileCheck --check-prefix=GD32-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/gd.32.so | FileCheck --check-prefix=GD32 %s
+
+## LA32 GD -> LE
+# RUN: ld.lld %t/a.32.o %t/bc.32.o %t/tga.32.o -o %t/le.32
+# RUN: llvm-readelf -r %t/le.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.32 | FileCheck --check-prefix=LE32-GOT %s
+# RUN: ld.lld -pie %t/a.32.o %t/bc.32.o %t/tga.32.o -o %t/le-pie.32
+# RUN: llvm-readelf -r %t/le-pie.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le-pie.32 | FileCheck --check-prefix=LE32-GOT %s
+
+## LA32 GD -> IE
+# RUN: ld.lld %t/a.32.o %t/bc.32.so %t/tga.32.o -o %t/ie.32
+# RUN: llvm-readobj -r %t/ie.32 | FileCheck --check-prefix=IE32-REL %s
+# RUN: llvm-readelf -x .got %t/ie.32 | FileCheck --check-prefix=IE32-GOT %s
+
+## LA64 GD
+# RUN: ld.lld -shared %t/a.64.o %t/bc.64.o -o %t/gd.64.so
+# RUN: llvm-readobj -r %t/gd.64.so | FileCheck --check-prefix=GD64-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/gd.64.so | FileCheck --check-prefix=GD64 %s
+
+## LA64 GD -> LE
+# RUN: ld.lld %t/a.64.o %t/bc.64.o %t/tga.64.o -o %t/le.64
+# RUN: llvm-readelf -r %t/le.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.64 | FileCheck --check-prefix=LE64-GOT %s
+# RUN: ld.lld -pie %t/a.64.o %t/bc.64.o %t/tga.64.o -o %t/le-pie.64
+# RUN: llvm-readelf -r %t/le-pie.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le-pie.64 | FileCheck --check-prefix=LE64-GOT %s
+
+## LA64 GD -> IE
+# RUN: ld.lld %t/a.64.o %t/bc.64.so %t/tga.64.o -o %t/ie.64
+# RUN: llvm-readobj -r %t/ie.64 | FileCheck --check-prefix=IE64-REL %s
+# RUN: llvm-readelf -x .got %t/ie.64 | FileCheck --check-prefix=IE64-GOT %s
+
+# GD32-REL: .rela.dyn {
+# GD32-REL-NEXT: 0x20300 R_LARCH_TLS_DTPMOD32 a 0x0
+# GD32-REL-NEXT: 0x20304 R_LARCH_TLS_DTPREL32 a 0x0
+# GD32-REL-NEXT: 0x20308 R_LARCH_TLS_DTPMOD32 b 0x0
+# GD32-REL-NEXT: 0x2030C R_LARCH_TLS_DTPREL32 b 0x0
+# GD32-REL-NEXT: }
+
+## &DTPMOD(a) - . = 0x20300 - 0x10250 = 16428<<2
+# GD32: 10250: pcaddi $a0, 16428
+# GD32-NEXT: bl 44
+
+## &DTPMOD(b) - . = 0x20308 - 0x10258 = 16428<<2
+# GD32: 10258: pcaddi $a0, 16428
+# GD32-NEXT: bl 36
+
+# GD64-REL: .rela.dyn {
+# GD64-REL-NEXT: 0x204C0 R_LARCH_TLS_DTPMOD64 a 0x0
+# GD64-REL-NEXT: 0x204C8 R_LARCH_TLS_DTPREL64 a 0x0
+# GD64-REL-NEXT: 0x204D0 R_LARCH_TLS_DTPMOD64 b 0x0
+# GD64-REL-NEXT: 0x204D8 R_LARCH_TLS_DTPREL64 b 0x0
+# GD64-REL-NEXT: }
+
+## &DTPMOD(a) - . = 0x204c0 - 0x10398 = 16458<<2
+# GD64: 10398: pcaddi $a0, 16458
+# GD64-NEXT: bl 52
+
+## &DTPMOD(b) - . = 0x204d0 - 0x103a0 = 16460<<2
+# GD64: 103a0: pcaddi $a0, 16460
+# GD64-NEXT: bl 44
+
+# NOREL: no relocations
+
+## .got contains pre-populated values: [a@dtpmod, a@dtprel, b@dtpmod, b@dtprel]
+## a@dtprel = st_value(a) = 0x8
+## b@dtprel = st_value(b) = 0xc
+# LE32-GOT: section '.got':
+# LE32-GOT-NEXT: 0x[[#%x,A:]] 01000000 08000000 01000000 0c000000
+# LE64-GOT: section '.got':
+# LE64-GOT-NEXT: 0x[[#%x,A:]] 01000000 00000000 08000000 00000000
+# LE64-GOT-NEXT: 0x[[#%x,A:]] 01000000 00000000 0c000000 00000000
+
+## a is local - relaxed to LE - its DTPMOD/DTPREL slots are link-time constants.
+## b is external - DTPMOD/DTPREL dynamic relocations are required.
+# IE32-REL: .rela.dyn {
+# IE32-REL-NEXT: 0x30220 R_LARCH_TLS_DTPMOD32 b 0x0
+# IE32-REL-NEXT: 0x30224 R_LARCH_TLS_DTPREL32 b 0x0
+# IE32-REL-NEXT: }
+# IE32-GOT: section '.got':
+# IE32-GOT-NEXT: 0x00030218 01000000 08000000 00000000 00000000
+
+# IE64-REL: .rela.dyn {
+# IE64-REL-NEXT: 0x30380 R_LARCH_TLS_DTPMOD64 b 0x0
+# IE64-REL-NEXT: 0x30388 R_LARCH_TLS_DTPREL64 b 0x0
+# IE64-REL-NEXT: }
+# IE64-GOT: section '.got':
+# IE64-GOT-NEXT: 0x00030370 01000000 00000000 08000000 00000000
+# IE64-GOT-NEXT: 0x00030380 00000000 00000000 00000000 00000000
+
+#--- a.s
+pcaddi $a0, %gd_pcrel_20(a)
+bl %plt(__tls_get_addr)
+
+pcaddi $a0, %gd_pcrel_20(b)
+bl %plt(__tls_get_addr)
+
+.section .tbss,"awT",@nobits
+.globl a
+.zero 8
+a:
+.zero 4
+
+#--- bc.s
+.section .tbss,"awT",@nobits
+.globl b, c
+b:
+.zero 4
+c:
+
+#--- tga.s
+.globl __tls_get_addr
+__tls_get_addr:
diff --git a/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s b/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s
new file mode 100644
index 0000000..70186f5
--- /dev/null
+++ b/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s
@@ -0,0 +1,82 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 --position-independent %t/a.s -o %t/a.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/tga.s -o %t/tga.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --position-independent %t/a.s -o %t/a.64.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/tga.s -o %t/tga.64.o
+
+## LA32 LD
+# RUN: ld.lld -shared %t/a.32.o -o %t/ld.32.so
+# RUN: llvm-readobj -r %t/ld.32.so | FileCheck --check-prefix=LD32-REL %s
+# RUN: llvm-readelf -x .got %t/ld.32.so | FileCheck --check-prefix=LD32-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/ld.32.so | FileCheck --check-prefixes=LD32 %s
+
+## LA32 LD -> LE
+# RUN: ld.lld %t/a.32.o %t/tga.32.o -o %t/le.32
+# RUN: llvm-readelf -r %t/le.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.32 | FileCheck --check-prefix=LE32-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/le.32 | FileCheck --check-prefixes=LE32 %s
+
+## LA64 LD
+# RUN: ld.lld -shared %t/a.64.o -o %t/ld.64.so
+# RUN: llvm-readobj -r %t/ld.64.so | FileCheck --check-prefix=LD64-REL %s
+# RUN: llvm-readelf -x .got %t/ld.64.so | FileCheck --check-prefix=LD64-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/ld.64.so | FileCheck --check-prefixes=LD64 %s
+
+## LA64 LD -> LE
+# RUN: ld.lld %t/a.64.o %t/tga.64.o -o %t/le.64
+# RUN: llvm-readelf -r %t/le.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.64 | FileCheck --check-prefix=LE64-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/le.64 | FileCheck --check-prefixes=LE64 %s
+
+## a@dtprel = st_value(a) = 0 is a link-time constant.
+# LD32-REL: .rela.dyn {
+# LD32-REL-NEXT: 0x20280 R_LARCH_TLS_DTPMOD32 - 0x0
+# LD32-REL-NEXT: }
+# LD32-GOT: section '.got':
+# LD32-GOT-NEXT: 0x00020280 00000000 00000000
+
+# LD64-REL: .rela.dyn {
+# LD64-REL-NEXT: 0x20400 R_LARCH_TLS_DTPMOD64 - 0x0
+# LD64-REL-NEXT: }
+# LD64-GOT: section '.got':
+# LD64-GOT-NEXT: 0x00020400 00000000 00000000 00000000 00000000
+
+## LA32: &DTPMOD(a) - . = 0x20280 - 0x101cc = 16429<<2
+# LD32: 101cc: pcaddi $a0, 16429
+# LD32-NEXT: bl 48
+
+## LA64: &DTPMOD(a) - . = 0x20400 - 0x102e0 = 16456<<2
+# LD64: 102e0: pcaddi $a0, 16456
+# LD64-NEXT: bl 44
+
+# NOREL: no relocations
+
+## a is local - its DTPMOD/DTPREL slots are link-time constants.
+## a@dtpmod = 1 (main module)
+# LE32-GOT: section '.got':
+# LE32-GOT-NEXT: 0x0003011c 01000000 00000000
+
+# LE64-GOT: section '.got':
+# LE64-GOT-NEXT: 0x000301d0 01000000 00000000 00000000 00000000
+
+## LA32: DTPMOD(.LANCHOR0) - . = 0x3011c - 0x20114 = 16386<<2
+# LE32: 20114: pcaddi $a0, 16386
+# LE32-NEXT: bl 4
+
+## LA64: DTPMOD(.LANCHOR0) - . = 0x301d0 - 0x201c8 = 16386<<2
+# LE64: 201c8: pcaddi $a0, 16386
+# LE64-NEXT: bl 4
+
+#--- a.s
+pcaddi $a0, %ld_pcrel_20(.LANCHOR0)
+bl %plt(__tls_get_addr)
+
+.section .tbss,"awT",@nobits
+.set .LANCHOR0, . + 0
+.zero 8
+
+#--- tga.s
+.globl __tls_get_addr
+__tls_get_addr:
diff --git a/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s b/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s
new file mode 100644
index 0000000..99e21d9
--- /dev/null
+++ b/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s
@@ -0,0 +1,142 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=loongarch64 a.s -o a.64.o
+# RUN: llvm-mc -filetype=obj -triple=loongarch64 c.s -o c.64.o
+# RUN: ld.lld -shared -soname=c.64.so c.64.o -o c.64.so
+# RUN: llvm-mc -filetype=obj -triple=loongarch32 --defsym ELF32=1 a.s -o a.32.o
+# RUN: llvm-mc -filetype=obj -triple=loongarch32 --defsym ELF32=1 c.s -o c.32.o
+# RUN: ld.lld -shared -soname=c.32.so c.32.o -o c.32.so
+
+# RUN: ld.lld -shared -z now a.64.o c.64.o -o a.64.so
+# RUN: llvm-readobj -r -x .got a.64.so | FileCheck --check-prefix=GD64-RELA %s
+# RUN: llvm-objdump --no-show-raw-insn -h -d a.64.so | FileCheck %s --check-prefix=GD64
+
+# RUN: ld.lld -shared -z now a.64.o c.64.o -o rel.64.so -z rel
+# RUN: llvm-readobj -r -x .got rel.64.so | FileCheck --check-prefix=GD64-REL %s
+
+## FIXME: The transition from TLSDESC to IE/LE has not yet been implemented.
+## Keep the dynamic relocations and hand them over to the dynamic linker.
+
+# RUN: ld.lld -e 0 -z now a.64.o c.64.o -o a.64.le
+# RUN: llvm-readobj -r -x .got a.64.le | FileCheck --check-prefix=LE64-RELA %s
+
+# RUN: ld.lld -e 0 -z now a.64.o c.64.so -o a.64.ie
+# RUN: llvm-readobj -r -x .got a.64.ie | FileCheck --check-prefix=IE64-RELA %s
+
+## 32-bit code is mostly the same. We only test a few variants.
+
+# RUN: ld.lld -shared -z now a.32.o c.32.o -o rel.32.so -z rel
+# RUN: llvm-readobj -r -x .got rel.32.so | FileCheck --check-prefix=GD32-REL %s
+
+# GD64-RELA: .rela.dyn {
+# GD64-RELA-NEXT: 0x203F0 R_LARCH_TLS_DESC64 - 0x7FF
+# GD64-RELA-NEXT: 0x203D0 R_LARCH_TLS_DESC64 a 0x0
+# GD64-RELA-NEXT: 0x203E0 R_LARCH_TLS_DESC64 c 0x0
+# GD64-RELA-NEXT: }
+# GD64-RELA: Hex dump of section '.got':
+# GD64-RELA-NEXT: 0x000203d0 00000000 00000000 00000000 00000000 .
+# GD64-RELA-NEXT: 0x000203e0 00000000 00000000 00000000 00000000 .
+# GD64-RELA-NEXT: 0x000203f0 00000000 00000000 00000000 00000000 .
+
+# GD64-REL: .rel.dyn {
+# GD64-REL-NEXT: 0x203D8 R_LARCH_TLS_DESC64 -
+# GD64-REL-NEXT: 0x203B8 R_LARCH_TLS_DESC64 a
+# GD64-REL-NEXT: 0x203C8 R_LARCH_TLS_DESC64 c
+# GD64-REL-NEXT: }
+# GD64-REL: Hex dump of section '.got':
+# GD64-REL-NEXT: 0x000203b8 00000000 00000000 00000000 00000000 .
+# GD64-REL-NEXT: 0x000203c8 00000000 00000000 00000000 00000000 .
+# GD64-REL-NEXT: 0x000203d8 00000000 00000000 ff070000 00000000 .
+
+# GD64: .got 00000030 00000000000203d0
+
+## &.got[a]-. = 0x203d0 - 0x102e0 = 16444<<2
+# GD64: 102e0: pcaddi $a0, 16444
+# GD64-NEXT: ld.d $ra, $a0, 0
+# GD64-NEXT: jirl $ra, $ra, 0
+# GD64-NEXT: add.d $a1, $a0, $tp
+
+## &.got[b]-. = 0x203d0+32 - 0x102f0 = 16448<<2
+# GD64: 102f0: pcaddi $a0, 16448
+# GD64-NEXT: ld.d $ra, $a0, 0
+# GD64-NEXT: jirl $ra, $ra, 0
+# GD64-NEXT: add.d $a2, $a0, $tp
+
+## &.got[c]-. = 0x203d0+16 - 0x10300 = 16440<<2
+# GD64: 10300: pcaddi $a0, 16440
+# GD64-NEXT: ld.d $ra, $a0, 0
+# GD64-NEXT: jirl $ra, $ra, 0
+# GD64-NEXT: add.d $a3, $a0, $tp
+
+# LE64-RELA: .rela.dyn {
+# LE64-RELA-NEXT: 0x30240 R_LARCH_TLS_DESC64 - 0x8
+# LE64-RELA-NEXT: 0x30250 R_LARCH_TLS_DESC64 - 0x800
+# LE64-RELA-NEXT: 0x30260 R_LARCH_TLS_DESC64 - 0x7FF
+# LE64-RELA-NEXT: }
+# LE64-RELA: Hex dump of section '.got':
+# LE64-RELA-NEXT: 0x00030240 00000000 00000000 00000000 00000000 .
+# LE64-RELA-NEXT: 0x00030250 00000000 00000000 00000000 00000000 .
+# LE64-RELA-NEXT: 0x00030260 00000000 00000000 00000000 00000000 .
+
+# IE64-RELA: .rela.dyn {
+# IE64-RELA-NEXT: 0x303C8 R_LARCH_TLS_DESC64 - 0x8
+# IE64-RELA-NEXT: 0x303E8 R_LARCH_TLS_DESC64 - 0x7FF
+# IE64-RELA-NEXT: 0x303D8 R_LARCH_TLS_DESC64 c 0x0
+# IE64-RELA-NEXT: }
+# IE64-RELA: Hex dump of section '.got':
+# IE64-RELA-NEXT: 0x000303c8 00000000 00000000 00000000 00000000 .
+# IE64-RELA-NEXT: 0x000303d8 00000000 00000000 00000000 00000000 .
+# IE64-RELA-NEXT: 0x000303e8 00000000 00000000 00000000 00000000 .
+
+# GD32-REL: .rel.dyn {
+# GD32-REL-NEXT: 0x20264 R_LARCH_TLS_DESC32 -
+# GD32-REL-NEXT: 0x20254 R_LARCH_TLS_DESC32 a
+# GD32-REL-NEXT: 0x2025C R_LARCH_TLS_DESC32 c
+# GD32-REL-NEXT: }
+# GD32-REL: Hex dump of section '.got':
+# GD32-REL-NEXT: 0x00020254 00000000 00000000 00000000 00000000 .
+# GD32-REL-NEXT: 0x00020264 00000000 ff070000 .
+
+#--- a.s
+.macro add dst, src1, src2
+.ifdef ELF32
+add.w \dst, \src1, \src2
+.else
+add.d \dst, \src1, \src2
+.endif
+.endm
+.macro load dst, src1, src2
+.ifdef ELF32
+ld.w \dst, \src1, \src2
+.else
+ld.d \dst, \src1, \src2
+.endif
+.endm
+
+pcaddi $a0, %desc_pcrel_20(a)
+load $ra, $a0, %desc_ld(a)
+jirl $ra, $ra, %desc_call(a)
+add $a1, $a0, $tp
+
+pcaddi $a0, %desc_pcrel_20(b)
+load $ra, $a0, %desc_ld(b)
+jirl $ra, $ra, %desc_call(b)
+add $a2, $a0, $tp
+
+pcaddi $a0, %desc_pcrel_20(c)
+load $ra, $a0, %desc_ld(c)
+jirl $ra, $ra, %desc_call(c)
+add $a3, $a0, $tp
+
+.section .tbss,"awT",@nobits
+.globl a
+.zero 8
+a:
+.zero 2039 ## Place b at 0x7ff
+b:
+.zero 1
+
+#--- c.s
+.section .tbss,"awT",@nobits
+.globl c
+c: .zero 4
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
index 187370e..5d0a3e3 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
@@ -1145,8 +1145,8 @@ Status GDBRemoteCommunication::StartDebugserverProcess(
if (socket_pipe.CanWrite())
socket_pipe.CloseWriteFileDescriptor();
if (socket_pipe.CanRead()) {
- char port_cstr[PATH_MAX] = {0};
- port_cstr[0] = '\0';
+      // The port number is at most "65535" plus its NUL terminator (6 bytes).
+ char port_cstr[6] = {0};
size_t num_bytes = sizeof(port_cstr);
// Read port from pipe with 10 second timeout.
error = socket_pipe.ReadWithTimeout(
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index f70efe5..0386e3b 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -4726,66 +4726,67 @@ TypeSystemClang::GetFloatTypeSemantics(size_t byte_size) {
}
std::optional<uint64_t>
+TypeSystemClang::GetObjCBitSize(QualType qual_type,
+ ExecutionContextScope *exe_scope) {
+ assert(qual_type->isObjCObjectOrInterfaceType());
+ ExecutionContext exe_ctx(exe_scope);
+ if (Process *process = exe_ctx.GetProcessPtr()) {
+ if (ObjCLanguageRuntime *objc_runtime =
+ ObjCLanguageRuntime::Get(*process)) {
+ if (std::optional<uint64_t> bit_size =
+ objc_runtime->GetTypeBitSize(GetType(qual_type)))
+ return *bit_size;
+ }
+ } else {
+ static bool g_printed = false;
+ if (!g_printed) {
+ StreamString s;
+ DumpTypeDescription(qual_type.getAsOpaquePtr(), s);
+
+ llvm::outs() << "warning: trying to determine the size of type ";
+ llvm::outs() << s.GetString() << "\n";
+ llvm::outs() << "without a valid ExecutionContext. this is not "
+ "reliable. please file a bug against LLDB.\n";
+ llvm::outs() << "backtrace:\n";
+ llvm::sys::PrintStackTrace(llvm::outs());
+ llvm::outs() << "\n";
+ g_printed = true;
+ }
+ }
+
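+  // Fall back to the static AST size, plus the size of the builtin
+  // Objective-C Class type, when the runtime cannot provide a size.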
+ return getASTContext().getTypeSize(qual_type) +
+ getASTContext().getTypeSize(getASTContext().ObjCBuiltinClassTy);
+}
+
+std::optional<uint64_t>
TypeSystemClang::GetBitSize(lldb::opaque_compiler_type_t type,
ExecutionContextScope *exe_scope) {
- if (GetCompleteType(type)) {
- clang::QualType qual_type(GetCanonicalQualType(type));
- const clang::Type::TypeClass type_class = qual_type->getTypeClass();
- switch (type_class) {
- case clang::Type::Record:
- if (GetCompleteType(type))
- return getASTContext().getTypeSize(qual_type);
- else
- return std::nullopt;
- break;
+ if (!GetCompleteType(type))
+ return std::nullopt;
- case clang::Type::ObjCInterface:
- case clang::Type::ObjCObject: {
- ExecutionContext exe_ctx(exe_scope);
- Process *process = exe_ctx.GetProcessPtr();
- if (process) {
- if (ObjCLanguageRuntime *objc_runtime =
- ObjCLanguageRuntime::Get(*process)) {
- if (std::optional<uint64_t> bit_size =
- objc_runtime->GetTypeBitSize(GetType(qual_type)))
- return *bit_size;
- }
- } else {
- static bool g_printed = false;
- if (!g_printed) {
- StreamString s;
- DumpTypeDescription(type, s);
-
- llvm::outs() << "warning: trying to determine the size of type ";
- llvm::outs() << s.GetString() << "\n";
- llvm::outs() << "without a valid ExecutionContext. this is not "
- "reliable. please file a bug against LLDB.\n";
- llvm::outs() << "backtrace:\n";
- llvm::sys::PrintStackTrace(llvm::outs());
- llvm::outs() << "\n";
- g_printed = true;
- }
- }
- }
- [[fallthrough]];
- default:
- const uint32_t bit_size = getASTContext().getTypeSize(qual_type);
- if (bit_size == 0) {
- if (qual_type->isIncompleteArrayType())
- return getASTContext().getTypeSize(
- qual_type->getArrayElementTypeNoTypeQual()
- ->getCanonicalTypeUnqualified());
- }
- if (qual_type->isObjCObjectOrInterfaceType())
- return bit_size +
- getASTContext().getTypeSize(getASTContext().ObjCBuiltinClassTy);
- // Function types actually have a size of 0, that's not an error.
- if (qual_type->isFunctionProtoType())
- return bit_size;
- if (bit_size)
- return bit_size;
- }
+ clang::QualType qual_type(GetCanonicalQualType(type));
+ const clang::Type::TypeClass type_class = qual_type->getTypeClass();
+ switch (type_class) {
+ case clang::Type::FunctionProto:
+ case clang::Type::Record:
+ return getASTContext().getTypeSize(qual_type);
+ case clang::Type::ObjCInterface:
+ case clang::Type::ObjCObject:
+ return GetObjCBitSize(qual_type, exe_scope);
+ case clang::Type::IncompleteArray: {
+ const uint64_t bit_size = getASTContext().getTypeSize(qual_type);
+ if (bit_size == 0)
+ return getASTContext().getTypeSize(
+ qual_type->getArrayElementTypeNoTypeQual()
+ ->getCanonicalTypeUnqualified());
+
+ return bit_size;
+ }
+ default:
+ if (const uint64_t bit_size = getASTContext().getTypeSize(qual_type))
+ return bit_size;
}
+
return std::nullopt;
}
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
index d67b7a4..70722eb 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
@@ -1172,6 +1172,9 @@ private:
/// on creation of a new instance.
void LogCreation() const;
+ std::optional<uint64_t> GetObjCBitSize(clang::QualType qual_type,
+ ExecutionContextScope *exe_scope);
+
// Classes that inherit from TypeSystemClang can see and modify these
std::string m_target_triple;
std::unique_ptr<clang::ASTContext> m_ast_up;
diff --git a/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py b/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py
index 9e484e0..730537d 100644
--- a/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py
+++ b/lldb/test/API/lang/cpp/class-template-non-type-parameter-pack/TestClassTemplateNonTypeParameterPack.py
@@ -5,9 +5,6 @@ from lldbsuite.test import lldbutil
class TestCaseClassTemplateNonTypeParameterPack(TestBase):
- @expectedFailureAll(
- oslist=["windows"], archs=["i[3-6]86", "x86_64"]
- ) # Fails to read memory from target.
@no_debug_info_test
def test(self):
self.build()
diff --git a/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py b/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py
index 102c00d..1ed643e 100644
--- a/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py
+++ b/lldb/test/API/lang/cpp/class-template-type-parameter-pack/TestClassTemplateTypeParameterPack.py
@@ -5,9 +5,6 @@ from lldbsuite.test import lldbutil
class TestCaseClassTemplateTypeParameterPack(TestBase):
- @expectedFailureAll(
- oslist=["windows"], archs=["i[3-6]86", "x86_64"]
- ) # Fails to read memory from target.
@no_debug_info_test
def test(self):
self.build()
diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md
index 8ecbaf7..11a14d2 100644
--- a/lldb/tools/lldb-dap/README.md
+++ b/lldb/tools/lldb-dap/README.md
@@ -157,6 +157,20 @@ locally on port `2345`.
}
```
+You can also use the `gdb-remote-port` parameter to send an attach request
+to a debug server running on the current machine,
+instead of using the custom command `attachCommands`.
+
+```javascript
+{
+ "name": "Local Debug Server",
+ "type": "lldb-dap",
+ "request": "attach",
+ "program": "/tmp/a.out",
+ "gdb-remote-port": 2345,
+}
+```
+
#### Connect to a Debug Server on Another Machine
This connects to a debug server running on another machine with hostname
@@ -173,6 +187,23 @@ port `5678` of that other machine.
}
```
+You can also use the `gdb-remote-hostname` and `gdb-remote-port` parameters
+to send an attach request to a debug server running on a different machine,
+instead of the custom `attachCommands` command.
+The default hostname is `localhost`.
+
+```javascript
+{
+  "name": "Remote Debug Server",
+ "type": "lldb-dap",
+ "request": "attach",
+ "program": "/tmp/a.out",
+ "gdb-remote-port": 5678,
+ "gdb-remote-hostname": "hostname",
+}
+```
+
## Custom debugger commands
The `lldb-dap` tool includes additional custom commands to support the Debug
diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json
index fd5de30..97e4efe 100644
--- a/lldb/tools/lldb-dap/package.json
+++ b/lldb/tools/lldb-dap/package.json
@@ -1,7 +1,7 @@
{
"name": "lldb-dap",
"displayName": "LLDB DAP",
- "version": "0.2.2",
+ "version": "0.2.3",
"publisher": "llvm-vs-code-extensions",
"homepage": "https://lldb.llvm.org",
"description": "LLDB debugging from VSCode",
@@ -353,7 +353,7 @@
"number",
"string"
],
- "description": "TCP/IP port to attach to. Specifying both pid and port is an error."
+ "description": "TCP/IP port to attach to a remote system. Specifying both pid and port is an error."
},
"gdb-remote-hostname": {
"type": "string",
diff --git a/llvm/bindings/ocaml/llvm/llvm.ml b/llvm/bindings/ocaml/llvm/llvm.ml
index 86b010e..908e665 100644
--- a/llvm/bindings/ocaml/llvm/llvm.ml
+++ b/llvm/bindings/ocaml/llvm/llvm.ml
@@ -519,7 +519,6 @@ external vector_size : lltype -> int = "llvm_vector_size"
(*--... Operations on other types ..........................................--*)
external void_type : llcontext -> lltype = "llvm_void_type"
external label_type : llcontext -> lltype = "llvm_label_type"
-external x86_mmx_type : llcontext -> lltype = "llvm_x86_mmx_type"
external type_by_name : llmodule -> string -> lltype option = "llvm_type_by_name"
external classify_value : llvalue -> ValueKind.t = "llvm_classify_value"
diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst
index e00165c..231de56 100644
--- a/llvm/docs/ProgrammersManual.rst
+++ b/llvm/docs/ProgrammersManual.rst
@@ -1392,6 +1392,7 @@ How to use reduce-chunk-list:
First, figure out the number of calls to the debug counter you want to minimize.
To do so, run the compilation command for the case you want to minimize with `-print-debug-counter`, adding `-mllvm` if needed.
Then find the line with the counter of interest. It should look like:
+
.. code-block:: none
my-counter : {5678,empty}
@@ -1400,6 +1401,7 @@ The number of calls to `my-counter` is 5678
Then find the minimum set of interesting chunks with `reduce-chunk-list`.
Build a reproducer script like:
+
.. code-block:: bash
#! /bin/bash
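# A minimal sketch of the rest of such a reproducer (hedged: `my-counter`,
# `repro.ll`, and the grep pattern below are placeholders, and the script is
# assumed to receive the candidate chunk list as its first argument):
chunks=$1
# Re-run the failing compile with only the selected chunks of the counter
# enabled; under clang the flag would need the `-mllvm` prefix noted above.
opt -debug-counter=my-counter=$chunks repro.ll -S -o out.ll
# Exit 0 exactly when the bug still reproduces, so the reducer keeps chunks.
grep -q "bad-pattern" out.ll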
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 67d474d..f71cd5b 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -118,12 +118,13 @@ public:
return;
}
- const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
+ const KeyT EmptyKey = getEmptyKey();
if (std::is_trivially_destructible<ValueT>::value) {
// Use a simpler loop when values don't need destruction.
for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P)
P->getFirst() = EmptyKey;
} else {
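+    // The tombstone key is needed only on this path, to skip empty and
+    // tombstone buckets while destroying the values of occupied ones.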
+ const KeyT TombstoneKey = getTombstoneKey();
unsigned NumEntries = getNumEntries();
for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey)) {
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index cc40d2e..43eac14 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -200,9 +200,7 @@ public:
///
/// Only checks sets with elements in \p CheckDeps.
bool areDepsSafe(const DepCandidates &AccessSets,
- const MemAccessInfoList &CheckDeps,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects);
+ const MemAccessInfoList &CheckDeps);
/// No memory dependence was encountered that would inhibit
/// vectorization.
@@ -352,11 +350,8 @@ private:
/// element access it records this distance in \p MinDepDistBytes (if this
/// distance is smaller than any other distance encountered so far).
/// Otherwise, this function returns true signaling a possible dependence.
- Dependence::DepType
- isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
- unsigned BIdx,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects);
+ Dependence::DepType isDependent(const MemAccessInfo &A, unsigned AIdx,
+ const MemAccessInfo &B, unsigned BIdx);
/// Check whether the data dependence could prevent store-load
/// forwarding.
@@ -393,11 +388,9 @@ private:
/// determined, or a struct containing (Distance, Stride, TypeSize, AIsWrite,
/// BIsWrite).
std::variant<Dependence::DepType, DepDistanceStrideAndSizeInfo>
- getDependenceDistanceStrideAndSize(
- const MemAccessInfo &A, Instruction *AInst, const MemAccessInfo &B,
- Instruction *BInst,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects);
+ getDependenceDistanceStrideAndSize(const MemAccessInfo &A, Instruction *AInst,
+ const MemAccessInfo &B,
+ Instruction *BInst);
};
class RuntimePointerChecking;
@@ -799,7 +792,8 @@ replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
Value *Ptr);
/// If the pointer has a constant stride return it in units of the access type
-/// size. Otherwise return std::nullopt.
+/// size. If the pointer is loop-invariant, return 0. Otherwise return
+/// std::nullopt.
///
/// Ensure that it does not wrap in the address space, assuming the predicate
/// associated with \p PSE is true.
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index f57be39..36d1b47 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -225,6 +225,9 @@ private:
/// split stack prologue.
bool HasNoSplitStack = false;
+ /// True if debugging information is available in this module.
+ bool DbgInfoAvailable = false;
+
protected:
explicit AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
@@ -430,6 +433,9 @@ public:
/// Get the CFISection type for the module.
CFISection getModuleCFISectionType() const { return ModuleCFISection; }
+ /// Returns true if valid debug info is present.
+ bool hasDebugInfo() const { return DbgInfoAvailable; }
+
bool needsSEHMoves();
/// Since emitting CFI unwind information is entangled with supporting the
diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
index 45a47d7..04667c0 100644
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -188,6 +188,9 @@ public:
/// SelectionDAGISel::PrepareEHLandingPad().
unsigned ExceptionPointerVirtReg, ExceptionSelectorVirtReg;
+ /// The current call site index being processed, if any. 0 if none.
+ unsigned CurCallSite = 0;
+
/// Collection of dbg.declare instructions handled after argument
/// lowering and before ISel proper.
SmallPtrSet<const DbgDeclareInst *, 8> PreprocessedDbgDeclares;
@@ -281,6 +284,12 @@ public:
Register getCatchPadExceptionPointerVReg(const Value *CPI,
const TargetRegisterClass *RC);
+ /// Set the call site currently being processed.
+ void setCurrentCallSite(unsigned Site) { CurCallSite = Site; }
+
+ /// Get the call site currently being processed, if any. Return zero if none.
+ unsigned getCurrentCallSite() { return CurCallSite; }
+
private:
/// LiveOutRegInfo - Information about live out vregs.
IndexedMap<LiveOutInfo, VirtReg2IndexFunctor> LiveOutRegInfo;
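A sketch of the SjLj call-site bookkeeping after its move from MachineModuleInfo to this per-function object (FuncInfo is the FunctionLoweringInfo of the function being selected; this mirrors the SelectionDAGBuilder changes later in the patch):

  // While lowering llvm.eh.sjlj.callsite (CI is its constant operand):
  FuncInfo.setCurrentCallSite(CI->getZExtValue());
  // When the matching invoke's begin label is emitted:
  if (unsigned CallSiteIndex = FuncInfo.getCurrentCallSite()) {
    MF.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
    FuncInfo.setCurrentCallSite(0); // stop tracking this call site
  }

Since the counter is inherently per-function, keeping it here also removes a module-level mutation from instruction selection.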
diff --git a/llvm/include/llvm/CodeGen/MIRPrinter.h b/llvm/include/llvm/CodeGen/MIRPrinter.h
index e98ef72..85bd674 100644
--- a/llvm/include/llvm/CodeGen/MIRPrinter.h
+++ b/llvm/include/llvm/CodeGen/MIRPrinter.h
@@ -21,6 +21,7 @@ namespace llvm {
class MachineBasicBlock;
class MachineFunction;
+class MachineModuleInfo;
class Module;
template <typename T> class SmallVectorImpl;
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index 6e7292a..e1d03fe 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -59,7 +59,6 @@ class MachineConstantPool;
class MachineFrameInfo;
class MachineFunction;
class MachineJumpTableInfo;
-class MachineModuleInfo;
class MachineRegisterInfo;
class MCContext;
class MCInstrDesc;
@@ -260,7 +259,6 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
const LLVMTargetMachine &Target;
const TargetSubtargetInfo *STI;
MCContext &Ctx;
- MachineModuleInfo &MMI;
// RegInfo - Information about each register in use in the function.
MachineRegisterInfo *RegInfo;
@@ -395,15 +393,15 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
/// \}
- /// Clear all the members of this MachineFunction, but the ones used
- /// to initialize again the MachineFunction.
- /// More specifically, this deallocates all the dynamically allocated
- /// objects and get rid of all the XXXInfo data structure, but keep
- /// unchanged the references to Fn, Target, MMI, and FunctionNumber.
+ /// Clear all the members of this MachineFunction, except the ones used to
+ /// reinitialize the MachineFunction. More specifically, this deallocates all
+ /// the dynamically allocated objects, gets rid of all the XXXInfo data
+ /// structures, and keeps the references to Fn, Target, and FunctionNumber
+ /// unchanged.
void clear();
/// Allocate and initialize the different members.
/// In particular, the XXXInfo data structure.
- /// \pre Fn, Target, MMI, and FunctionNumber are properly set.
+ /// \pre Fn, Target, and FunctionNumber are properly set.
void init();
public:
@@ -632,8 +630,8 @@ public:
const static unsigned int DebugOperandMemNumber;
MachineFunction(Function &F, const LLVMTargetMachine &Target,
- const TargetSubtargetInfo &STI, unsigned FunctionNum,
- MachineModuleInfo &MMI);
+ const TargetSubtargetInfo &STI, MCContext &Ctx,
+ unsigned FunctionNum);
MachineFunction(const MachineFunction &) = delete;
MachineFunction &operator=(const MachineFunction &) = delete;
~MachineFunction();
@@ -665,7 +663,6 @@ public:
GISelChangeObserver *getObserver() const { return Observer; }
- MachineModuleInfo &getMMI() const { return MMI; }
MCContext &getContext() const { return Ctx; }
/// Returns the Section this function belongs to.
diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 229f515..04c8144 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1723,7 +1723,7 @@ public:
/// Return true if it is safe to move this instruction. If
/// SawStore is set to true, it means that there is a store (or call) between
/// the instruction's location and its intended destination.
- bool isSafeToMove(AAResults *AA, bool &SawStore) const;
+ bool isSafeToMove(bool &SawStore) const;
/// Returns true if this instruction's memory access aliases the memory
/// access of Other.
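Call sites simply drop the removed AAResults argument; a before/after sketch:

  bool SawStore = false;
  // Before: if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore) && !MI.isPHI())
  if (!MI.isSafeToMove(SawStore) && !MI.isPHI())
    return false; // treat the instruction as having side effects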
diff --git a/llvm/include/llvm/CodeGen/MachineModuleInfo.h b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
index 97b439c..310cc4b 100644
--- a/llvm/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
@@ -99,27 +99,6 @@ class MachineModuleInfo {
/// want.
MachineModuleInfoImpl *ObjFileMMI;
- /// \name Exception Handling
- /// \{
-
- /// The current call site index being processed, if any. 0 if none.
- unsigned CurCallSite = 0;
-
- /// \}
-
- // TODO: Ideally, what we'd like is to have a switch that allows emitting
- // synchronous (precise at call-sites only) CFA into .eh_frame. However,
- // even under this switch, we'd like .debug_frame to be precise when using
- // -g. At this moment, there's no way to specify that some CFI directives
- // go into .eh_frame only, while others go into .debug_frame only.
-
- /// True if debugging information is available in this module.
- bool DbgInfoAvailable = false;
-
- /// True if this module is being built for windows/msvc, and uses floating
- /// point. This is used to emit an undefined reference to _fltused.
- bool UsesMSVCFloatingPoint = false;
-
/// Maps IR Functions to their corresponding MachineFunctions.
DenseMap<const Function*, std::unique_ptr<MachineFunction>> MachineFunctions;
/// Next unique number available for a MachineFunction.
@@ -186,23 +165,6 @@ public:
return const_cast<MachineModuleInfo*>(this)->getObjFileInfo<Ty>();
}
- /// Returns true if valid debug info is present.
- bool hasDebugInfo() const { return DbgInfoAvailable; }
-
- bool usesMSVCFloatingPoint() const { return UsesMSVCFloatingPoint; }
-
- void setUsesMSVCFloatingPoint(bool b) { UsesMSVCFloatingPoint = b; }
-
- /// \name Exception Handling
- /// \{
-
- /// Set the call site currently being processed.
- void setCurrentCallSite(unsigned Site) { CurCallSite = Site; }
-
- /// Get the call site currently being processed, if any. return zero if
- /// none.
- unsigned getCurrentCallSite() { return CurCallSite; }
-
/// \}
}; // End class MachineModuleInfo
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index a905c85..966e04f 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -490,6 +490,12 @@ m_c_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC) {
CC);
}
+template <typename T0_P, typename T1_P, typename T2_P>
+inline TernaryOpc_match<T0_P, T1_P, T2_P>
+m_Select(const T0_P &Cond, const T1_P &T, const T2_P &F) {
+ return TernaryOpc_match<T0_P, T1_P, T2_P>(ISD::SELECT, Cond, T, F);
+}
+
// === Binary operations ===
template <typename LHS_P, typename RHS_P, bool Commutable = false,
bool ExcludeChain = false>
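A usage sketch for the new matcher inside a combine (N is the SDNode or SDValue under inspection; illustrative, following the existing SDPatternMatch idiom):

  using namespace llvm::SDPatternMatch;
  SDValue Cond, TVal, FVal;
  if (sd_match(N, m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))) {
    // N is an ISD::SELECT; Cond, TVal, and FVal are bound to its operands.
  }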
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 24eab7b..6a80c8c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -76,6 +76,7 @@ struct KnownBits;
class LLVMContext;
class MachineBasicBlock;
class MachineConstantPoolValue;
+class MachineModuleInfo;
class MCSymbol;
class OptimizationRemarkEmitter;
class ProfileSummaryInfo;
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 54f71c9..882cade 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -46,6 +46,7 @@ class LiveIntervals;
class LiveVariables;
class MachineLoop;
class MachineMemOperand;
+class MachineModuleInfo;
class MachineRegisterInfo;
class MCAsmInfo;
class MCInst;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 646d2f7..a2b124d 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -725,11 +725,6 @@ public:
const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; }
- const DenseMap<Value *, SmallVector<const Value *, 16>> &
- getUnderlyingObjects() const {
- return UnderlyingObjects;
- }
-
private:
typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
@@ -1455,22 +1450,23 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
}
/// Check whether the access through \p Ptr has a constant stride.
-std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE,
- Type *AccessTy, Value *Ptr,
- const Loop *Lp,
- const DenseMap<Value *, const SCEV *> &StridesMap,
- bool Assume, bool ShouldCheckWrap) {
+std::optional<int64_t>
+llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
+ const Loop *Lp,
+ const DenseMap<Value *, const SCEV *> &StridesMap,
+ bool Assume, bool ShouldCheckWrap) {
+ const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
+ if (PSE.getSE()->isLoopInvariant(PtrScev, Lp))
+ return {0};
+
Type *Ty = Ptr->getType();
assert(Ty->isPointerTy() && "Unexpected non-ptr");
-
if (isa<ScalableVectorType>(AccessTy)) {
LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
<< "\n");
return std::nullopt;
}
- const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
-
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
if (Assume && !AR)
AR = PSE.getAsAddRec(Ptr);
@@ -1897,23 +1893,11 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
return ScaledDist % Stride;
}
-/// Returns true if any of the underlying objects has a loop varying address,
-/// i.e. may change in \p L.
-static bool
-isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
- ScalarEvolution &SE, const Loop *L) {
- return any_of(UnderlyingObjects, [&SE, L](const Value *UO) {
- return !SE.isLoopInvariant(SE.getSCEV(const_cast<Value *>(UO)), L);
- });
-}
-
std::variant<MemoryDepChecker::Dependence::DepType,
MemoryDepChecker::DepDistanceStrideAndSizeInfo>
MemoryDepChecker::getDependenceDistanceStrideAndSize(
const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
- const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects) {
+ const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) {
const auto &DL = InnermostLoop->getHeader()->getDataLayout();
auto &SE = *PSE.getSE();
const auto &[APtr, AIsWrite] = A;
@@ -1931,12 +1915,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
BPtr->getType()->getPointerAddressSpace())
return MemoryDepChecker::Dependence::Unknown;
- int64_t StrideAPtr =
- getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true)
- .value_or(0);
- int64_t StrideBPtr =
- getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true)
- .value_or(0);
+ std::optional<int64_t> StrideAPtr =
+ getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true);
+ std::optional<int64_t> StrideBPtr =
+ getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true);
const SCEV *Src = PSE.getSCEV(APtr);
const SCEV *Sink = PSE.getSCEV(BPtr);
@@ -1944,26 +1926,19 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
// If the induction step is negative we have to invert source and sink of the
// dependence when measuring the distance between them. We should not swap
// AIsWrite with BIsWrite, as their uses expect them in program order.
- if (StrideAPtr < 0) {
+ if (StrideAPtr && *StrideAPtr < 0) {
std::swap(Src, Sink);
std::swap(AInst, BInst);
+ std::swap(StrideAPtr, StrideBPtr);
}
const SCEV *Dist = SE.getMinusSCEV(Sink, Src);
LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
- << "(Induction step: " << StrideAPtr << ")\n");
+ << "\n");
LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
<< ": " << *Dist << "\n");
- // Needs accesses where the addresses of the accessed underlying objects do
- // not change within the loop.
- if (isLoopVariantIndirectAddress(UnderlyingObjects.find(APtr)->second, SE,
- InnermostLoop) ||
- isLoopVariantIndirectAddress(UnderlyingObjects.find(BPtr)->second, SE,
- InnermostLoop))
- return MemoryDepChecker::Dependence::IndirectUnsafe;
-
// Check if we can prove that Sink only accesses memory after Src's end or
// vice versa. At the moment this is limited to cases where either source or
// sink are loop invariant to avoid compile-time increases. This is not
@@ -1985,12 +1960,33 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
}
}
- // Need accesses with constant strides and the same direction. We don't want
- // to vectorize "A[B[i]] += ..." and similar code or pointer arithmetic that
- // could wrap in the address space.
- if (!StrideAPtr || !StrideBPtr || (StrideAPtr > 0 && StrideBPtr < 0) ||
- (StrideAPtr < 0 && StrideBPtr > 0)) {
+ // Need accesses with constant strides and the same direction for further
+ // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and
+ // similar code or pointer arithmetic that could wrap in the address space.
+
+ // If either Src or Sink is not strided (i.e. not a non-wrapping AddRec) and
+ // not loop-invariant (the stride is 0 in that case), we cannot analyze the
+ // dependence further and also cannot generate runtime checks.
+ if (!StrideAPtr || !StrideBPtr) {
LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
+ return MemoryDepChecker::Dependence::IndirectUnsafe;
+ }
+
+ int64_t StrideAPtrInt = *StrideAPtr;
+ int64_t StrideBPtrInt = *StrideBPtr;
+ LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
+ << " Sink induction step: " << StrideBPtrInt << "\n");
+ // At least one of Src and Sink is loop-invariant, and the other is strided
+ // or invariant. We can generate a runtime check to disambiguate the accesses.
+ if (StrideAPtrInt == 0 || StrideBPtrInt == 0)
+ return MemoryDepChecker::Dependence::Unknown;
+
+ // Both Src and Sink have a constant stride, check if they are in the same
+ // direction.
+ if ((StrideAPtrInt > 0 && StrideBPtrInt < 0) ||
+ (StrideAPtrInt < 0 && StrideBPtrInt > 0)) {
+ LLVM_DEBUG(
+ dbgs() << "Pointer access with strides in different directions\n");
return MemoryDepChecker::Dependence::Unknown;
}
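Condensed, the rewritten stride handling above is this decision table (a sketch over the optional strides, not the verbatim code):

  if (!StrideAPtr || !StrideBPtr)
    return Dependence::IndirectUnsafe; // not an AddRec, not loop-invariant
  if (*StrideAPtr == 0 || *StrideBPtr == 0)
    return Dependence::Unknown;        // one side invariant: runtime-checkable
  if ((*StrideAPtr > 0) != (*StrideBPtr > 0))
    return Dependence::Unknown;        // constant strides, opposite directions
  // Otherwise both strides are constant with the same sign; continue to the
  // distance-based analysis using their absolute values.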
@@ -1999,22 +1995,20 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
if (!HasSameSize)
TypeByteSize = 0;
- return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtr),
- std::abs(StrideBPtr), TypeByteSize,
+ return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtrInt),
+ std::abs(StrideBPtrInt), TypeByteSize,
AIsWrite, BIsWrite);
}
-MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
- const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
- unsigned BIdx,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects) {
+MemoryDepChecker::Dependence::DepType
+MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
+ const MemAccessInfo &B, unsigned BIdx) {
assert(AIdx < BIdx && "Must pass arguments in program order");
// Get the dependence distance, stride, type size and what access writes for
// the dependence between A and B.
- auto Res = getDependenceDistanceStrideAndSize(
- A, InstMap[AIdx], B, InstMap[BIdx], UnderlyingObjects);
+ auto Res =
+ getDependenceDistanceStrideAndSize(A, InstMap[AIdx], B, InstMap[BIdx]);
if (std::holds_alternative<Dependence::DepType>(Res))
return std::get<Dependence::DepType>(Res);
@@ -2248,10 +2242,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
return Dependence::BackwardVectorizable;
}
-bool MemoryDepChecker::areDepsSafe(
- const DepCandidates &AccessSets, const MemAccessInfoList &CheckDeps,
- const DenseMap<Value *, SmallVector<const Value *, 16>>
- &UnderlyingObjects) {
+bool MemoryDepChecker::areDepsSafe(const DepCandidates &AccessSets,
+ const MemAccessInfoList &CheckDeps) {
MinDepDistBytes = -1;
SmallPtrSet<MemAccessInfo, 8> Visited;
@@ -2294,8 +2286,8 @@ bool MemoryDepChecker::areDepsSafe(
if (*I1 > *I2)
std::swap(A, B);
- Dependence::DepType Type = isDependent(*A.first, A.second, *B.first,
- B.second, UnderlyingObjects);
+ Dependence::DepType Type =
+ isDependent(*A.first, A.second, *B.first, B.second);
mergeInStatus(Dependence::isSafeForVectorization(Type));
// Gather dependences unless we accumulated MaxDependences
@@ -2650,8 +2642,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
if (Accesses.isDependencyCheckNeeded()) {
LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
DepsAreSafe = DepChecker->areDepsSafe(DependentAccesses,
- Accesses.getDependenciesToCheck(),
- Accesses.getUnderlyingObjects());
+ Accesses.getDependenciesToCheck());
if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) {
LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 2297b27..4e35664 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -436,6 +436,7 @@ bool AsmPrinter::doInitialization(Module &M) {
MMI = MMIWP ? &MMIWP->getMMI() : nullptr;
HasSplitStack = false;
HasNoSplitStack = false;
+ DbgInfoAvailable = !M.debug_compile_units().empty();
AddrLabelSymbols = nullptr;
@@ -541,8 +542,7 @@ bool AsmPrinter::doInitialization(Module &M) {
if (EmitCodeView && TM.getTargetTriple().isOSWindows())
DebugHandlers.push_back(std::make_unique<CodeViewDebug>(this));
if (!EmitCodeView || M.getDwarfVersion()) {
- assert(MMI && "MMI could not be nullptr here!");
- if (MMI->hasDebugInfo()) {
+ if (hasDebugInfo()) {
DD = new DwarfDebug(this);
DebugHandlers.push_back(std::unique_ptr<DwarfDebug>(DD));
}
@@ -1277,8 +1277,7 @@ AsmPrinter::getFunctionCFISectionType(const Function &F) const {
if (MAI->usesCFIWithoutEH() && F.hasUWTable())
return CFISection::EH;
- assert(MMI != nullptr && "Invalid machine module info");
- if (MMI->hasDebugInfo() || TM.Options.ForceDwarfFrameSection)
+ if (hasDebugInfo() || TM.Options.ForceDwarfFrameSection)
return CFISection::Debug;
return CFISection::None;
@@ -1669,10 +1668,9 @@ void AsmPrinter::emitPCSections(const MachineFunction &MF) {
}
/// Returns true if function begin and end labels should be emitted.
-static bool needFuncLabels(const MachineFunction &MF,
- const MachineModuleInfo &MMI) {
- if (!MF.getLandingPads().empty() || MF.hasEHFunclets() ||
- MMI.hasDebugInfo() ||
+static bool needFuncLabels(const MachineFunction &MF, const AsmPrinter &Asm) {
+ if (Asm.hasDebugInfo() || !MF.getLandingPads().empty() ||
+ MF.hasEHFunclets() ||
MF.getFunction().hasMetadata(LLVMContext::MD_pcsections))
return true;
@@ -1944,7 +1942,7 @@ void AsmPrinter::emitFunctionBody() {
// are automatically sized.
bool EmitFunctionSize = MAI->hasDotTypeDotSizeDirective() && !TT.isWasm();
- if (needFuncLabels(*MF, *MMI) || EmitFunctionSize) {
+ if (EmitFunctionSize || needFuncLabels(*MF, *this)) {
// Create a symbol for the end of function.
CurrentFnEnd = createTempSymbol("func_end");
OutStreamer->emitLabel(CurrentFnEnd);
@@ -2588,7 +2586,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
if (F.hasFnAttribute("patchable-function-entry") ||
F.hasFnAttribute("function-instrument") ||
F.hasFnAttribute("xray-instruction-threshold") ||
- needFuncLabels(MF, *MMI) || NeedsLocalForSize ||
+ needFuncLabels(MF, *this) || NeedsLocalForSize ||
MF.getTarget().Options.EmitStackSizeSection ||
MF.getTarget().Options.BBAddrMap || MF.hasBBLabels()) {
CurrentFnBegin = createTempSymbol("func_begin");
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index dddc08b..7700ffd 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -613,7 +613,7 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
void CodeViewDebug::beginModule(Module *M) {
// If module doesn't have named metadata anchors or COFF debug section
// is not available, skip any debug info related stuff.
- if (!MMI->hasDebugInfo() ||
+ if (!Asm->hasDebugInfo() ||
!Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) {
Asm = nullptr;
return;
@@ -636,7 +636,7 @@ void CodeViewDebug::beginModule(Module *M) {
}
void CodeViewDebug::endModule() {
- if (!Asm || !MMI->hasDebugInfo())
+ if (!Asm || !Asm->hasDebugInfo())
return;
// The COFF .debug$S section consists of several subsections, each starting
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 6c70c47..de2263c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -241,10 +241,7 @@ bool DebugHandlerBase::isUnsignedDIType(const DIType *Ty) {
Ty->getTag() == dwarf::DW_TAG_unspecified_type;
}
-static bool hasDebugInfo(const MachineModuleInfo *MMI,
- const MachineFunction *MF) {
- if (!MMI->hasDebugInfo())
- return false;
+static bool hasDebugInfo(const MachineFunction *MF) {
auto *SP = MF->getFunction().getSubprogram();
if (!SP)
return false;
@@ -258,7 +255,7 @@ static bool hasDebugInfo(const MachineModuleInfo *MMI,
void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
PrevInstBB = nullptr;
- if (!Asm || !hasDebugInfo(MMI, MF)) {
+ if (!Asm || !hasDebugInfo(MF)) {
skippedNonDebugFunction();
return;
}
@@ -354,7 +351,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
}
void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
- if (!Asm || !MMI->hasDebugInfo())
+ if (!Asm || !Asm->hasDebugInfo())
return;
assert(CurMI == nullptr);
@@ -380,7 +377,7 @@ void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
}
void DebugHandlerBase::endInstruction() {
- if (!Asm || !MMI->hasDebugInfo())
+ if (!Asm || !Asm->hasDebugInfo())
return;
assert(CurMI != nullptr);
@@ -415,7 +412,7 @@ void DebugHandlerBase::endInstruction() {
}
void DebugHandlerBase::endFunction(const MachineFunction *MF) {
- if (Asm && hasDebugInfo(MMI, MF))
+ if (Asm && hasDebugInfo(MF))
endFunctionImpl(MF);
DbgValues.clear();
DbgLabels.clear();
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 5f1f315..ac4d0f2 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -1148,14 +1148,15 @@ sortGlobalExprs(SmallVectorImpl<DwarfCompileUnit::GlobalExpr> &GVEs) {
void DwarfDebug::beginModule(Module *M) {
DebugHandlerBase::beginModule(M);
- if (!Asm || !MMI->hasDebugInfo())
+ if (!Asm)
return;
unsigned NumDebugCUs = std::distance(M->debug_compile_units_begin(),
M->debug_compile_units_end());
+ if (NumDebugCUs == 0)
+ return;
+
assert(NumDebugCUs > 0 && "Asm unexpectedly initialized");
- assert(MMI->hasDebugInfo() &&
- "DebugInfoAvailabilty unexpectedly not initialized");
SingleCU = NumDebugCUs == 1;
DenseMap<DIGlobalVariable *, SmallVector<DwarfCompileUnit::GlobalExpr, 1>>
GVMap;
@@ -1433,7 +1434,7 @@ void DwarfDebug::endModule() {
// If we aren't actually generating debug info (check beginModule -
// conditionalized on the presence of the llvm.dbg.cu metadata node)
- if (!Asm || !MMI->hasDebugInfo())
+ if (!Asm || !Asm->hasDebugInfo())
return;
// Finalize the debug info for the module.
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 92a03eb5..1dc2785 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -1891,7 +1891,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
// Also avoid moving code above predicated instruction since it's hard to
// reason about register liveness with predicated instruction.
bool DontMoveAcrossStore = true;
- if (!PI->isSafeToMove(nullptr, DontMoveAcrossStore) || TII->isPredicated(*PI))
+ if (!PI->isSafeToMove(DontMoveAcrossStore) || TII->isPredicated(*PI))
return MBB->end();
// Find out what registers are live. Note this routine is ignoring other live
@@ -2015,7 +2015,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
break;
bool DontMoveAcrossStore = true;
- if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore))
+ if (!TIB->isSafeToMove(DontMoveAcrossStore))
break;
// Remove kills from ActiveDefsSet, these registers had short live ranges.
diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
index 578854c..7fc25cd 100644
--- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -92,7 +92,7 @@ bool DeadMachineInstructionElimImpl::isDead(const MachineInstr *MI) const {
// Don't delete instructions with side effects.
bool SawStore = false;
- if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI())
+ if (!MI->isSafeToMove(SawStore) && !MI->isPHI())
return false;
// Examine each operand.
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index a5c9949..d506c62 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -235,7 +235,7 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
// We never speculate stores, so an AA pointer isn't necessary.
bool DontMoveAcrossStore = true;
- if (!MI.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+ if (!MI.isSafeToMove(DontMoveAcrossStore)) {
LLVM_DEBUG(dbgs() << "Can't speculate: " << MI);
return false;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index c906f3a..0806648 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -236,7 +236,7 @@ bool llvm::isTriviallyDead(const MachineInstr &MI,
// If we can move an instruction, we can remove it. Otherwise, it has
// a side-effect of some sort.
bool SawStore = false;
- if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore) && !MI.isPHI())
+ if (!MI.isSafeToMove(SawStore) && !MI.isPHI())
return false;
// Instructions without side-effects are dead iff they only define dead vregs.
diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp
index f378956..ba5605a 100644
--- a/llvm/lib/CodeGen/IfConversion.cpp
+++ b/llvm/lib/CodeGen/IfConversion.cpp
@@ -2097,7 +2097,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
static bool MaySpeculate(const MachineInstr &MI,
SmallSet<MCPhysReg, 4> &LaterRedefs) {
bool SawStore = true;
- if (!MI.isSafeToMove(nullptr, SawStore))
+ if (!MI.isSafeToMove(SawStore))
return false;
for (const MachineOperand &MO : MI.operands()) {
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 7b7b545..c3c581d 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -238,7 +238,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
// We also need to make sure it is safe to move the load.
// Assume there are stores between DefMI and UseMI.
bool SawStore = true;
- if (!DefMI->isSafeToMove(nullptr, SawStore))
+ if (!DefMI->isSafeToMove(SawStore))
return false;
LLVM_DEBUG(dbgs() << "Try to fold single def: " << *DefMI
@@ -300,7 +300,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
// Use the same criteria as DeadMachineInstructionElim.
bool SawStore = false;
- if (!MI->isSafeToMove(nullptr, SawStore)) {
+ if (!MI->isSafeToMove(SawStore)) {
LLVM_DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI);
return;
}
diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp
index af7d6c4..3e3e3e5 100644
--- a/llvm/lib/CodeGen/LiveRangeShrink.cpp
+++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -153,7 +153,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (!MI.isSafeToMove(nullptr, SawStore)) {
+ if (!MI.isSafeToMove(SawStore)) {
// If MI has side effects, it should become a barrier for code motion.
// IOM is rebuilt from the next instruction to prevent later
// instructions from being moved before this MI.
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 7f6a752..40bde20 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -162,9 +162,9 @@ static inline Align getFnStackAlignment(const TargetSubtargetInfo *STI,
}
MachineFunction::MachineFunction(Function &F, const LLVMTargetMachine &Target,
- const TargetSubtargetInfo &STI,
- unsigned FunctionNum, MachineModuleInfo &mmi)
- : F(F), Target(Target), STI(&STI), Ctx(mmi.getContext()), MMI(mmi) {
+ const TargetSubtargetInfo &STI, MCContext &Ctx,
+ unsigned FunctionNum)
+ : F(F), Target(Target), STI(&STI), Ctx(Ctx) {
FunctionNumber = FunctionNum;
init();
}
@@ -654,9 +654,14 @@ void MachineFunction::print(raw_ostream &OS, const SlotIndexes *Indexes) const {
/// True if this function needs frame moves for debug or exceptions.
bool MachineFunction::needsFrameMoves() const {
- return getMMI().hasDebugInfo() ||
- getTarget().Options.ForceDwarfFrameSection ||
- F.needsUnwindTableEntry();
+ // TODO: Ideally, what we'd like is to have a switch that allows emitting
+ // synchronous (precise at call-sites only) CFA into .eh_frame. However, even
+ // under this switch, we'd like .debug_frame to be precise when using -g. At
+ // this moment, there's no way to specify that some CFI directives go into
+ // .eh_frame only, while others go into .debug_frame only.
+ return getTarget().Options.ForceDwarfFrameSection ||
+ F.needsUnwindTableEntry() ||
+ !F.getParent()->debug_compile_units().empty();
}
namespace llvm {
diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
index 24eb360..e7a4d6d 100644
--- a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
@@ -37,7 +37,7 @@ MachineFunctionAnalysis::run(Function &F, FunctionAnalysisManager &FAM) {
.getCachedResult<MachineModuleAnalysis>(*F.getParent())
->getMMI();
auto MF = std::make_unique<MachineFunction>(
- F, *TM, STI, Context.generateMachineFunctionNum(F), MMI);
+ F, *TM, STI, MMI.getContext(), Context.generateMachineFunctionNum(F));
MF->initTargetMachineFunctionInfo(STI);
// MRI callback for target specific initializations.
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index be64e9c..583bc1b 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1293,7 +1293,7 @@ void MachineInstr::substituteRegister(Register FromReg, Register ToReg,
/// isSafeToMove - Return true if it is safe to move this instruction. If
/// SawStore is set to true, it means that there is a store (or call) between
/// the instruction's location and its intended destination.
-bool MachineInstr::isSafeToMove(AAResults *AA, bool &SawStore) const {
+bool MachineInstr::isSafeToMove(bool &SawStore) const {
// Ignore stuff that we obviously can't move.
//
// Treat volatile loads as stores. This is not strictly necessary for
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d75df2a..1e4bf4b 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1075,7 +1075,7 @@ static bool isCopyFeedingInvariantStore(const MachineInstr &MI,
bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) {
// Check if it's safe to move the instruction.
bool DontMoveAcrossStore = !HoistConstLoads || !AllowedToHoistLoads[CurLoop];
- if ((!I.isSafeToMove(AA, DontMoveAcrossStore)) &&
+ if ((!I.isSafeToMove(DontMoveAcrossStore)) &&
!(HoistConstStores && isInvariantStore(I, TRI, MRI))) {
LLVM_DEBUG(dbgs() << "LICM: Instruction not safe to move.\n");
return false;
diff --git a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
index 1f596cd..9d650e4 100644
--- a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
+++ b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
@@ -158,8 +158,7 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
Register FrameReg) {
DefedReg = MCRegister::NoRegister;
bool SawStore = true;
- if (!MI->isSafeToMove(nullptr, SawStore) || MI->isImplicitDef() ||
- MI->isInlineAsm())
+ if (!MI->isSafeToMove(SawStore) || MI->isImplicitDef() || MI->isInlineAsm())
return false;
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp
index 088e7602..c664959 100644
--- a/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -26,10 +26,7 @@ MachineModuleInfoImpl::~MachineModuleInfoImpl() = default;
void MachineModuleInfo::initialize() {
ObjFileMMI = nullptr;
- CurCallSite = 0;
NextFnNum = 0;
- UsesMSVCFloatingPoint = false;
- DbgInfoAvailable = false;
}
void MachineModuleInfo::finalize() {
@@ -47,7 +44,6 @@ MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI)
MachineFunctions(std::move(MMI.MachineFunctions)) {
Context.setObjectFileInfo(TM.getObjFileLowering());
ObjFileMMI = MMI.ObjFileMMI;
- CurCallSite = MMI.CurCallSite;
ExternalContext = MMI.ExternalContext;
TheModule = MMI.TheModule;
}
@@ -90,7 +86,7 @@ MachineFunction &MachineModuleInfo::getOrCreateMachineFunction(Function &F) {
if (I.second) {
// No pre-existing machine function, create a new one.
const TargetSubtargetInfo &STI = *TM.getSubtargetImpl(F);
- MF = new MachineFunction(F, TM, STI, NextFnNum++, *this);
+ MF = new MachineFunction(F, TM, STI, getContext(), NextFnNum++);
MF->initTargetMachineFunctionInfo(STI);
// MRI callback for target specific initializations.
@@ -210,7 +206,6 @@ bool MachineModuleInfoWrapperPass::doInitialization(Module &M) {
Ctx.diagnose(
DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie));
});
- MMI.DbgInfoAvailable = !M.debug_compile_units().empty();
return false;
}
@@ -235,6 +230,5 @@ MachineModuleAnalysis::run(Module &M, ModuleAnalysisManager &) {
Ctx.diagnose(
DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie));
});
- MMI.DbgInfoAvailable = !M.debug_compile_units().empty();
return Result(MMI);
}
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 4b3ff57..f10b98c 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -374,7 +374,7 @@ bool MachineSinking::PerformSinkAndFold(MachineInstr &MI,
// Check if it's safe to move the instruction.
bool SawStore = true;
- if (!MI.isSafeToMove(AA, SawStore))
+ if (!MI.isSafeToMove(SawStore))
return false;
// Convergent operations may not be made control-dependent on additional
@@ -687,7 +687,7 @@ void MachineSinking::FindCycleSinkCandidates(
continue;
}
bool DontMoveAcrossStore = true;
- if (!MI.isSafeToMove(AA, DontMoveAcrossStore)) {
+ if (!MI.isSafeToMove(DontMoveAcrossStore)) {
LLVM_DEBUG(dbgs() << "CycleSink: Instruction not safe to move.\n");
continue;
}
@@ -1654,7 +1654,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
return false;
// Check if it's safe to move the instruction.
- if (!MI.isSafeToMove(AA, SawStore))
+ if (!MI.isSafeToMove(SawStore))
return false;
// Convergent operations may not be made control-dependent on additional
@@ -1705,7 +1705,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
bool TryBreak = false;
bool Store =
MI.mayLoad() ? hasStoreBetween(ParentBlock, SuccToSinkTo, MI) : true;
- if (!MI.isSafeToMove(AA, Store)) {
+ if (!MI.isSafeToMove(Store)) {
LLVM_DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
TryBreak = true;
}
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 0f29ebe..dae0cb2 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -739,7 +739,7 @@ void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
bool SawStore = false;
// Check if it's safe to remove the instruction due to side effects.
// We can, and want to, remove Phis here.
- if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
+ if (!MI->isSafeToMove(SawStore) && !MI->isPHI()) {
++MI;
continue;
}
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 043ea20..f6c53f3 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1320,7 +1320,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
if (!definesFullReg(*DefMI, SrcReg))
return false;
bool SawStore = false;
- if (!DefMI->isSafeToMove(AA, SawStore))
+ if (!DefMI->isSafeToMove(SawStore))
return false;
const MCInstrDesc &MCID = DefMI->getDesc();
if (MCID.getNumDefs() != 1)
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index ef9f783..e255bba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1203,11 +1203,6 @@ void FastISel::handleDbgInfo(const Instruction *II) {
if (DbgLabelRecord *DLR = dyn_cast<DbgLabelRecord>(&DR)) {
assert(DLR->getLabel() && "Missing label");
- if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
- LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DLR << "\n");
- continue;
- }
-
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DLR->getDebugLoc(),
TII.get(TargetOpcode::DBG_LABEL))
.addMetadata(DLR->getLabel());
@@ -1402,12 +1397,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
case Intrinsic::dbg_declare: {
const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
assert(DI->getVariable() && "Missing variable");
- if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
- LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI
- << " (!hasDebugInfo)\n");
- return true;
- }
-
if (FuncInfo.PreprocessedDbgDeclares.contains(DI))
return true;
@@ -1446,11 +1435,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
case Intrinsic::dbg_label: {
const DbgLabelInst *DI = cast<DbgLabelInst>(II);
assert(DI->getLabel() && "Missing label");
- if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
- LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
- return true;
- }
-
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
TII.get(TargetOpcode::DBG_LABEL)).addMetadata(DI->getLabel());
return true;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index af77b00..33a53df 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3395,13 +3395,13 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps);
HiOps[2] = Lo.getValue(1);
Hi = DAG.computeKnownBits(HiOps[2]).isZero()
- ? DAG.getNode(ISD::UADDO, dl, VTList, ArrayRef(HiOps, 2))
+ ? DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2))
: DAG.getNode(ISD::UADDO_CARRY, dl, VTList, HiOps);
} else {
Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps);
HiOps[2] = Lo.getValue(1);
Hi = DAG.computeKnownBits(HiOps[2]).isZero()
- ? DAG.getNode(ISD::USUBO, dl, VTList, ArrayRef(HiOps, 2))
+ ? DAG.getNode(ISD::SUB, dl, NVT, ArrayRef(HiOps, 2))
: DAG.getNode(ISD::USUBO_CARRY, dl, VTList, HiOps);
}
return;
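When known-bits analysis proves the carry (or borrow) out of the low half is zero, the high half no longer needs an overflow-producing node; schematically, for a double-wide add (a sketch, not the verbatim legalizer code):

  // Lo = UADDO(A.lo, B.lo)                     -> {sum, carry}
  // Hi = carry known zero
  //          ? ADD(A.hi, B.hi)                 // plain node, single result
  //          : UADDO_CARRY(A.hi, B.hi, carry)  // carry-chained otherwise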
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1791f1b..c554c0f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6708,10 +6708,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
case Intrinsic::eh_sjlj_callsite: {
ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(0));
- assert(DAG.getMMI()->getCurrentCallSite() == 0 &&
- "Overlapping call sites!");
+ assert(FuncInfo.getCurrentCallSite() == 0 && "Overlapping call sites!");
- DAG.getMMI()->setCurrentCallSite(CI->getZExtValue());
+ FuncInfo.setCurrentCallSite(CI->getZExtValue());
return;
}
case Intrinsic::eh_sjlj_functioncontext: {
@@ -8619,7 +8618,6 @@ SDValue SelectionDAGBuilder::lowerStartEH(SDValue Chain,
const BasicBlock *EHPadBB,
MCSymbol *&BeginLabel) {
MachineFunction &MF = DAG.getMachineFunction();
- MachineModuleInfo &MMI = MF.getMMI();
// Insert a label before the invoke call to mark the try range. This can be
// used to detect deletion of the invoke via the MachineModuleInfo.
@@ -8627,13 +8625,13 @@ SDValue SelectionDAGBuilder::lowerStartEH(SDValue Chain,
// For SjLj, keep track of which landing pads go with which invokes
// so as to maintain the ordering of pads in the LSDA.
- unsigned CallSiteIndex = MMI.getCurrentCallSite();
+ unsigned CallSiteIndex = FuncInfo.getCurrentCallSite();
if (CallSiteIndex) {
MF.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(CallSiteIndex);
// Now that the call site is handled, stop tracking it.
- MMI.setCurrentCallSite(0);
+ FuncInfo.setCurrentCallSite(0);
}
return DAG.getEHLabel(getCurSDLoc(), Chain, BeginLabel);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 401d23b..84331d2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -417,30 +417,6 @@ void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-static void computeUsesMSVCFloatingPoint(const Triple &TT, const Function &F,
- MachineModuleInfo &MMI) {
- // Only needed for MSVC
- if (!TT.isWindowsMSVCEnvironment())
- return;
-
- // If it's already set, nothing to do.
- if (MMI.usesMSVCFloatingPoint())
- return;
-
- for (const Instruction &I : instructions(F)) {
- if (I.getType()->isFPOrFPVectorTy()) {
- MMI.setUsesMSVCFloatingPoint(true);
- return;
- }
- for (const auto &Op : I.operands()) {
- if (Op->getType()->isFPOrFPVectorTy()) {
- MMI.setUsesMSVCFloatingPoint(true);
- return;
- }
- }
- }
-}
-
PreservedAnalyses
SelectionDAGISelPass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
@@ -802,9 +778,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
}
}
- // Determine if floating point is used for msvc
- computeUsesMSVCFloatingPoint(TM.getTargetTriple(), Fn, *CurDAG->getMMI());
-
// Release function-specific state. SDB and CurDAG are already cleared
// at this point.
FuncInfo->clear();
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 6f7905c..fb6274b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -948,7 +948,7 @@ bool TwoAddressInstructionImpl::rescheduleMIBelowKill(
return false;
bool SeenStore = true;
- if (!MI->isSafeToMove(AA, SeenStore))
+ if (!MI->isSafeToMove(SeenStore))
return false;
if (TII->getInstrLatency(InstrItins, *MI) > 1)
@@ -1131,7 +1131,7 @@ bool TwoAddressInstructionImpl::rescheduleKillAboveMI(
return false;
bool SeenStore = true;
- if (!KillMI->isSafeToMove(AA, SeenStore))
+ if (!KillMI->isSafeToMove(SeenStore))
return false;
SmallVector<Register, 2> Uses;
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index fc4d05c..49e5211 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -416,7 +416,7 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
// We never speculate stores, so an AA pointer isn't necessary.
bool DontMoveAcrossStore = true;
- if (!I.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+ if (!I.isSafeToMove(DontMoveAcrossStore)) {
LLVM_DEBUG(dbgs() << "Can't speculate: " << I);
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 63048c7..aaa2922 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2169,27 +2169,6 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
return Ret;
}
-static int sizeToSubRegIndex(unsigned Size) {
- switch (Size) {
- case 32:
- return AMDGPU::sub0;
- case 64:
- return AMDGPU::sub0_sub1;
- case 96:
- return AMDGPU::sub0_sub1_sub2;
- case 128:
- return AMDGPU::sub0_sub1_sub2_sub3;
- case 256:
- return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
- default:
- if (Size < 32)
- return AMDGPU::sub0;
- if (Size > 256)
- return -1;
- return sizeToSubRegIndex(llvm::bit_ceil(Size));
- }
-}
-
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
@@ -2293,8 +2272,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return false;
if (SrcSize > 32) {
- int SubRegIdx = sizeToSubRegIndex(DstSize);
- if (SubRegIdx == -1)
+ unsigned SubRegIdx =
+ DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
+ if (SubRegIdx == AMDGPU::NoSubRegister)
return false;
// Deal with weird cases where the class only partially supports the subreg
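The open-coded size switch is replaced by the generic sub-register query; the mapping it computes is, illustratively (assuming AMDGPU's usual sub-register indices):

  // DstSize < 32       -> AMDGPU::sub0
  // DstSize == 32      -> TRI.getSubRegFromChannel(0, 1) == sub0
  // DstSize == 64      -> TRI.getSubRegFromChannel(0, 2) == sub0_sub1
  // DstSize == 96      -> TRI.getSubRegFromChannel(0, 3) == sub0_sub1_sub2
  // no covering index  -> AMDGPU::NoSubRegister, so the selector bails out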
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index fc7aba3..c31f85d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -65,11 +65,6 @@ void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
printU32ImmOperand(MI, OpNo, STI, O);
}
-void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- O << formatDec(MI->getOperand(OpNo).getImm() & 0xf);
-}
-
void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
@@ -719,29 +714,25 @@ void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand,
void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- O << " wait_vdst:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << " wait_vdst:" << formatDec(MI->getOperand(OpNo).getImm());
}
void AMDGPUInstPrinter::printWaitVAVDst(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- O << " wait_va_vdst:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << " wait_va_vdst:" << formatDec(MI->getOperand(OpNo).getImm());
}
void AMDGPUInstPrinter::printWaitVMVSrc(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- O << " wait_vm_vsrc:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << " wait_vm_vsrc:" << formatDec(MI->getOperand(OpNo).getImm());
}
void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- O << " wait_exp:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << " wait_exp:" << formatDec(MI->getOperand(OpNo).getImm());
}
bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc,
@@ -1065,16 +1056,13 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
O << formatDec((Imm & 0xc0) >> 6) << ']';
} else if ((Imm >= DppCtrl::ROW_SHL_FIRST) &&
(Imm <= DppCtrl::ROW_SHL_LAST)) {
- O << "row_shl:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << "row_shl:" << formatDec(Imm - DppCtrl::ROW_SHL0);
} else if ((Imm >= DppCtrl::ROW_SHR_FIRST) &&
(Imm <= DppCtrl::ROW_SHR_LAST)) {
- O << "row_shr:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << "row_shr:" << formatDec(Imm - DppCtrl::ROW_SHR0);
} else if ((Imm >= DppCtrl::ROW_ROR_FIRST) &&
(Imm <= DppCtrl::ROW_ROR_LAST)) {
- O << "row_ror:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << "row_ror:" << formatDec(Imm - DppCtrl::ROW_ROR0);
} else if (Imm == DppCtrl::WAVE_SHL1) {
if (AMDGPU::isGFX10Plus(STI)) {
O << "/* wave_shl is not supported starting from GFX10 */";
@@ -1126,15 +1114,14 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
"than GFX90A/GFX10 */";
return;
}
- printU4ImmDecOperand(MI, OpNo, O);
+ O << formatDec(Imm - DppCtrl::ROW_SHARE_FIRST);
} else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) &&
(Imm <= DppCtrl::ROW_XMASK_LAST)) {
if (!AMDGPU::isGFX10Plus(STI)) {
O << "/* row_xmask is not supported on ASICs earlier than GFX10 */";
return;
}
- O << "row_xmask:";
- printU4ImmDecOperand(MI, OpNo, O);
+ O << "row_xmask:" << formatDec(Imm - DppCtrl::ROW_XMASK_FIRST);
} else {
O << "/* Invalid dpp_ctrl value */";
}
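Dropping printU4ImmDecOperand also makes the printed value explicit: the old helper masked the raw immediate with 0xf, which matched the row number only because the DppCtrl range bases happen to be 16-aligned, and it silently truncated any wider wait counts. Illustrative examples (enum values assumed from DppCtrl):

  // Imm == DppCtrl::ROW_SHL0 + 5         -> prints "row_shl:5"
  // Imm == DppCtrl::ROW_XMASK_FIRST + 9  -> prints "row_xmask:9"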
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index c5fad38..4a39022 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -38,7 +38,6 @@ public:
private:
void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 4218b7d..3d6e8ef 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -950,7 +950,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
(vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)),
(!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset,
(extract_cpol $cachepolicy))> {
- let OtherPredicates = [isGFX7Only];
+ let SubtargetPredicate = isGFX7Only;
let AddedComplexity = 1;
}
@@ -958,12 +958,12 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$soffset, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isNotGFX9Plus];
+ let SubtargetPredicate = isNotGFX9Plus;
}
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX9Plus];
+ let SubtargetPredicate = isGFX9Plus;
}
// 4. Offset as an 32-bit SGPR + immediate
@@ -972,7 +972,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
timm:$cachepolicy),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset,
(extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX9Plus];
+ let SubtargetPredicate = isGFX9Plus;
}
}
@@ -981,28 +981,28 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val
def : GCNPat <
(node (SMRDImm i64:$sbase, i32:$offset)),
(vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 2. SGPR offset
def : GCNPat <
(node (SMRDSgpr i64:$sbase, i32:$soffset)),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 3. SGPR+IMM offset
def : GCNPat <
(node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 4. No offset
def : GCNPat <
(vt (node (i64 SReg_64:$sbase))),
(vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
}
@@ -1012,14 +1012,14 @@ multiclass ScalarBufferLoadIntrinsicPat <SDPatternOperator name, string Instr> {
def : GCNPat <
(name v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
(i32 (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 2. Offset as an 32-bit SGPR
def : GCNPat <
(name v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
(i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 3. Offset as an 32-bit SGPR + immediate
@@ -1028,7 +1028,7 @@ multiclass ScalarBufferLoadIntrinsicPat <SDPatternOperator name, string Instr> {
timm:$cachepolicy),
(i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset,
(extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
}
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 18cb516..3b9195b 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2311,7 +2311,7 @@ ARMBaseInstrInfo::canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI,
return nullptr;
}
bool DontMoveAcrossStores = true;
- if (!MI->isSafeToMove(/* AliasAnalysis = */ nullptr, DontMoveAcrossStores))
+ if (!MI->isSafeToMove(DontMoveAcrossStores))
return nullptr;
return MI;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 6926b02..1922675 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1037,7 +1037,7 @@ bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
if (MI->isInlineAsm())
continue;
// Delete PHIs if possible.
- if (!MI->isPHI() && !MI->isSafeToMove(nullptr, Store))
+ if (!MI->isPHI() && !MI->isSafeToMove(Store))
continue;
bool AllDead = true;
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index 8840c27..5e52cf0 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -1451,7 +1451,7 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
Opc == TargetOpcode::LIFETIME_END)
continue;
bool Store = false;
- if (MI->isInlineAsm() || !MI->isSafeToMove(nullptr, Store))
+ if (MI->isInlineAsm() || !MI->isSafeToMove(Store))
continue;
bool AllDead = true;
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index b8a3743..9f3d9b7 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -483,7 +483,7 @@ static MachineInstr *canFoldIntoSelect(Register Reg,
return nullptr;
}
bool DontMoveAcrossStores = true;
- if (!MI->isSafeToMove(/*AliasAnalysis=*/nullptr, DontMoveAcrossStores))
+ if (!MI->isSafeToMove(DontMoveAcrossStores))
return nullptr;
return MI;
}
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index 208bd3d..f52e188 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -450,6 +450,24 @@ public:
IsValidKind;
}
+ bool isSImm20pcaddi() const {
+ if (!isImm())
+ return false;
+
+ int64_t Imm;
+ LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None ||
+ VK == LoongArchMCExpr::VK_LoongArch_PCREL20_S2 ||
+ VK == LoongArchMCExpr::VK_LoongArch_TLS_LD_PCREL20_S2 ||
+ VK == LoongArchMCExpr::VK_LoongArch_TLS_GD_PCREL20_S2 ||
+ VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_PCREL20_S2;
+ return IsConstantImm
+ ? isInt<20>(Imm) && IsValidKind
+ : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
+ IsValidKind;
+ }
+
bool isSImm21lsl2() const {
if (!isImm())
return false;
@@ -1676,6 +1694,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
/*Upper=*/(1 << 19) - 1,
"operand must be a symbol with modifier (e.g. %call36) or an integer "
"in the range");
+ case Match_InvalidSImm20pcaddi:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, /*Lower=*/-(1 << 19),
+ /*Upper=*/(1 << 19) - 1,
+ "operand must be a symbol with modifier (e.g. %pcrel_20) or an integer "
+ "in the range");
case Match_InvalidSImm21lsl2:
return generateImmOutOfRangeError(
Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index ec0d0714..ef647a4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -397,6 +397,10 @@ def simm20_pcaddu18i : SImm20Operand {
let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">;
}
+def simm20_pcaddi : SImm20Operand {
+ let ParserMatchClass = SImmAsmOperand<20, "pcaddi">;
+}
+
def simm21_lsl2 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<21, "lsl2">;
let EncoderMethod = "getImmOpValueAsr<2>";
@@ -754,7 +758,7 @@ def SLT : ALU_3R<0x00120000>;
def SLTU : ALU_3R<0x00128000>;
def SLTI : ALU_2RI12<0x02000000, simm12>;
def SLTUI : ALU_2RI12<0x02400000, simm12>;
-def PCADDI : ALU_1RI20<0x18000000, simm20>;
+def PCADDI : ALU_1RI20<0x18000000, simm20_pcaddi>;
def PCADDU12I : ALU_1RI20<0x1c000000, simm20>;
def PCALAU12I : ALU_1RI20<0x1a000000, simm20_pcalau12i>;
def AND : ALU_3R<0x00148000>;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
index 29ed14f..370f5b0 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
@@ -111,6 +111,8 @@ enum Fixups {
fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX,
// Generate an R_LARCH_ALIGN which indicates the linker may fixup align here.
fixup_loongarch_align = FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN,
+ // 20-bit fixup corresponding to %pcrel_20(foo) for instruction pcaddi.
+ fixup_loongarch_pcrel20_s2,
// 36-bit fixup corresponding to %call36(foo) for a pair of instructions:
// pcaddu18i+jirl.
fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36,
@@ -142,6 +144,12 @@ enum Fixups {
fixup_loongarch_tls_le_add_r,
// 12-bit fixup corresponding to %le_lo12_r(foo) for instruction addi.w/d.
fixup_loongarch_tls_le_lo12_r,
+ // 20-bit fixup corresponding to %ld_pcrel_20(foo) for instruction pcaddi.
+ fixup_loongarch_tls_ld_pcrel20_s2,
+ // 20-bit fixup corresponding to %gd_pcrel_20(foo) for instruction pcaddi.
+ fixup_loongarch_tls_gd_pcrel20_s2,
+ // 20-bit fixup corresponding to %desc_pcrel_20(foo) for instruction pcaddi.
+ fixup_loongarch_tls_desc_pcrel20_s2,
};
} // end namespace LoongArch
} // end namespace llvm
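
The _PCREL20_S2 suffix reflects how these fixups are resolved: pcaddi computes PC + SignExtend(si20 << 2), so the linker takes a 4-byte-aligned, 22-bit PC-relative offset, shifts it right by 2, and stores it in the 20-bit immediate field (bits 24:5 of the instruction word, per the 1RI20 format). A hedged sketch of that packing, assuming the R_LARCH_PCREL20_S2 semantics offset = sym + addend - pc; the helper name is illustrative, not LLVM's:

    #include <cassert>
    #include <cstdint>

    // Illustrative only: pack a pc-relative offset into pcaddi's si20 field.
    uint32_t packPcrel20S2(uint32_t Insn, int64_t Offset) {
      assert((Offset & 0x3) == 0 && "offset must be 4-byte aligned");
      int64_t Imm = Offset >> 2;
      assert(Imm >= -(1 << 19) && Imm <= (1 << 19) - 1 && "out of si20 range");
      return Insn | ((static_cast<uint32_t>(Imm) & 0xfffff) << 5); // bits 24:5
    }

    int main() {
      // pcaddi's opcode is 0x18000000; an offset of +8 encodes immediate 2.
      assert(packPcrel20S2(0x18000000u, 8) == (0x18000000u | (2u << 5)));
    }
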
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index efbfce3..4f7f93f 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -287,6 +287,18 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
case LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R:
FixupKind = LoongArch::fixup_loongarch_tls_le_lo12_r;
break;
+ case LoongArchMCExpr::VK_LoongArch_PCREL20_S2:
+ FixupKind = LoongArch::fixup_loongarch_pcrel20_s2;
+ break;
+ case LoongArchMCExpr::VK_LoongArch_TLS_LD_PCREL20_S2:
+ FixupKind = LoongArch::fixup_loongarch_tls_ld_pcrel20_s2;
+ break;
+ case LoongArchMCExpr::VK_LoongArch_TLS_GD_PCREL20_S2:
+ FixupKind = LoongArch::fixup_loongarch_tls_gd_pcrel20_s2;
+ break;
+ case LoongArchMCExpr::VK_LoongArch_TLS_DESC_PCREL20_S2:
+ FixupKind = LoongArch::fixup_loongarch_tls_desc_pcrel20_s2;
+ break;
}
} else if (Kind == MCExpr::SymbolRef &&
cast<MCSymbolRefExpr>(Expr)->getKind() ==
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index 98b9b1a..53d46cc 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -166,6 +166,14 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) {
return "le_add_r";
case VK_LoongArch_TLS_LE_LO12_R:
return "le_lo12_r";
+ case VK_LoongArch_PCREL20_S2:
+ return "pcrel_20";
+ case VK_LoongArch_TLS_LD_PCREL20_S2:
+ return "ld_pcrel_20";
+ case VK_LoongArch_TLS_GD_PCREL20_S2:
+ return "gd_pcrel_20";
+ case VK_LoongArch_TLS_DESC_PCREL20_S2:
+ return "desc_pcrel_20";
}
}
@@ -222,6 +230,10 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) {
.Case("le_hi20_r", VK_LoongArch_TLS_LE_HI20_R)
.Case("le_add_r", VK_LoongArch_TLS_LE_ADD_R)
.Case("le_lo12_r", VK_LoongArch_TLS_LE_LO12_R)
+ .Case("pcrel_20", VK_LoongArch_PCREL20_S2)
+ .Case("ld_pcrel_20", VK_LoongArch_TLS_LD_PCREL20_S2)
+ .Case("gd_pcrel_20", VK_LoongArch_TLS_GD_PCREL20_S2)
+ .Case("desc_pcrel_20", VK_LoongArch_TLS_DESC_PCREL20_S2)
.Default(VK_LoongArch_Invalid);
}
@@ -264,6 +276,9 @@ void LoongArchMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
case VK_LoongArch_TLS_GD_HI20:
case VK_LoongArch_TLS_DESC_PC_HI20:
case VK_LoongArch_TLS_DESC_HI20:
+ case VK_LoongArch_TLS_LD_PCREL20_S2:
+ case VK_LoongArch_TLS_GD_PCREL20_S2:
+ case VK_LoongArch_TLS_DESC_PCREL20_S2:
break;
}
fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index 1546d89..91215b8 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -75,6 +75,10 @@ public:
VK_LoongArch_TLS_LE_HI20_R,
VK_LoongArch_TLS_LE_ADD_R,
VK_LoongArch_TLS_LE_LO12_R,
+ VK_LoongArch_PCREL20_S2,
+ VK_LoongArch_TLS_LD_PCREL20_S2,
+ VK_LoongArch_TLS_GD_PCREL20_S2,
+ VK_LoongArch_TLS_DESC_PCREL20_S2,
VK_LoongArch_Invalid // Must be the last item.
};
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 0b654ab..d7197a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -493,7 +493,7 @@ void NVPTXAsmPrinter::emitFunctionEntryLabel() {
// Emit initial .loc debug directive for correct relocation symbol data.
if (const DISubprogram *SP = MF->getFunction().getSubprogram()) {
assert(SP->getUnit());
- if (!SP->getUnit()->isDebugDirectivesOnly() && MMI && MMI->hasDebugInfo())
+ if (!SP->getUnit()->isDebugDirectivesOnly())
emitInitialRawDwarfLocDirective(*MF);
}
}
@@ -912,7 +912,7 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
if (HasFullDebugInfo)
break;
}
- if (MMI && MMI->hasDebugInfo() && HasFullDebugInfo)
+ if (HasFullDebugInfo)
O << ", debug";
O << "\n";
@@ -928,8 +928,6 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
}
bool NVPTXAsmPrinter::doFinalization(Module &M) {
- bool HasDebugInfo = MMI && MMI->hasDebugInfo();
-
// If we did not emit any functions, then the global declarations have not
// yet been emitted.
if (!GlobalsEmitted) {
@@ -945,7 +943,7 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
auto *TS =
static_cast<NVPTXTargetStreamer *>(OutStreamer->getTargetStreamer());
// Close the last emitted section
- if (HasDebugInfo) {
+ if (hasDebugInfo()) {
TS->closeLastSection();
// Emit empty .debug_loc section for better support of the empty files.
OutStreamer->emitRawText("\t.section\t.debug_loc\t{\t}");
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 0e2811f..dcde863 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -3137,11 +3137,11 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
break;
MCSymbol *TempSym = OutContext.createNamedTempSymbol();
OutStreamer->emitLabel(TempSym);
- OutStreamer->emitXCOFFExceptDirective(CurrentFnSym, TempSym,
- LangMO.getImm(), ReasonMO.getImm(),
- Subtarget->isPPC64() ? MI->getMF()->getInstructionCount() * 8 :
- MI->getMF()->getInstructionCount() * 4,
- MMI->hasDebugInfo());
+ OutStreamer->emitXCOFFExceptDirective(
+ CurrentFnSym, TempSym, LangMO.getImm(), ReasonMO.getImm(),
+ Subtarget->isPPC64() ? MI->getMF()->getInstructionCount() * 8
+ : MI->getMF()->getInstructionCount() * 4,
+ hasDebugInfo());
break;
}
case PPC::GETtlsMOD32AIX:
@@ -3199,7 +3199,7 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
bool PPCAIXAsmPrinter::doFinalization(Module &M) {
// Do streamer related finalization for DWARF.
- if (!MAI->usesDwarfFileAndLocDirectives() && MMI->hasDebugInfo())
+ if (!MAI->usesDwarfFileAndLocDirectives() && hasDebugInfo())
OutStreamer->doFinalizationAtSectionEnd(
OutStreamer->getContext().getObjectFileInfo()->getTextSection());
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 3c868db..2da03d8 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1370,9 +1370,7 @@ def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()"
def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">;
def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
- "SiFive 7-Series processors",
- [TuneNoDefaultUnroll,
- TuneShortForwardBranchOpt]>;
+ "SiFive 7-Series processors">;
def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron",
"Ventana Veyron-Series processors">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0339b30..41429ff 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14309,6 +14309,14 @@ struct NodeExtensionHelper {
return RISCVISD::VFWSUB_VL;
case RISCVISD::FMUL_VL:
return RISCVISD::VFWMUL_VL;
+ case RISCVISD::VFMADD_VL:
+ return RISCVISD::VFWMADD_VL;
+ case RISCVISD::VFMSUB_VL:
+ return RISCVISD::VFWMSUB_VL;
+ case RISCVISD::VFNMADD_VL:
+ return RISCVISD::VFWNMADD_VL;
+ case RISCVISD::VFNMSUB_VL:
+ return RISCVISD::VFWNMSUB_VL;
default:
llvm_unreachable("Unexpected opcode");
}
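
These new opcode mappings extend the widening combine from plain binary ops to FMAs: a VFMADD_VL whose multiplicands are fp_extends of narrower vectors can become a single VFWMADD_VL that extends and fuses in one instruction. A scalar sketch of the identity that makes the rewrite sound, under the usual assumption that the float-to-double extension is exact:

    #include <cassert>
    #include <cmath>

    int main() {
      float A = 1.1f, B = -3.7f;
      double Acc = 0.125;
      double PA = static_cast<double>(A), PB = static_cast<double>(B);
      // The product of two floats is exactly representable in double
      // (24-bit x 24-bit significands fit in 53 bits), so the fused and
      // unfused widening forms agree bit-for-bit:
      assert(std::fma(PA, PB, Acc) == PA * PB + Acc);
    }
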
@@ -14502,6 +14510,11 @@ struct NodeExtensionHelper {
Subtarget.hasStdExtZvbb();
case RISCVISD::SHL_VL:
return Subtarget.hasStdExtZvbb();
+ case RISCVISD::VFMADD_VL:
+ case RISCVISD::VFNMSUB_VL:
+ case RISCVISD::VFNMADD_VL:
+ case RISCVISD::VFMSUB_VL:
+ return true;
default:
return false;
}
@@ -14582,6 +14595,10 @@ struct NodeExtensionHelper {
case RISCVISD::FADD_VL:
case RISCVISD::FMUL_VL:
case RISCVISD::VFWADD_W_VL:
+ case RISCVISD::VFMADD_VL:
+ case RISCVISD::VFNMSUB_VL:
+ case RISCVISD::VFNMADD_VL:
+ case RISCVISD::VFMSUB_VL:
return true;
case ISD::SUB:
case RISCVISD::SUB_VL:
@@ -14797,6 +14814,10 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
Strategies.push_back(canFoldToVW_W);
break;
case RISCVISD::FMUL_VL:
+ case RISCVISD::VFMADD_VL:
+ case RISCVISD::VFMSUB_VL:
+ case RISCVISD::VFNMADD_VL:
+ case RISCVISD::VFNMSUB_VL:
Strategies.push_back(canFoldToVWWithSameExtension);
break;
case ISD::MUL:
@@ -14833,7 +14854,7 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
}
} // End anonymous namespace.
-/// Combine a binary operation to its equivalent VW or VW_W form.
+/// Combine a binary or FMA operation to its equivalent VW or VW_W form.
/// The supported combines are:
/// add | add_vl | or disjoint -> vwadd(u) | vwadd(u)_w
/// sub | sub_vl -> vwsub(u) | vwsub(u)_w
@@ -14846,9 +14867,9 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
/// vwsub_w(u) -> vwsub(u)
/// vfwadd_w -> vfwadd
/// vfwsub_w -> vfwsub
-static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const RISCVSubtarget &Subtarget) {
+static SDValue combineOp_VLToVWOp_VL(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
SelectionDAG &DAG = DCI.DAG;
if (DCI.isBeforeLegalize())
return SDValue();
@@ -14864,19 +14885,26 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
while (!Worklist.empty()) {
SDNode *Root = Worklist.pop_back_val();
- if (!NodeExtensionHelper::isSupportedRoot(Root, Subtarget))
- return SDValue();
NodeExtensionHelper LHS(Root, 0, DAG, Subtarget);
NodeExtensionHelper RHS(Root, 1, DAG, Subtarget);
- auto AppendUsersIfNeeded = [&Worklist,
+ auto AppendUsersIfNeeded = [&Worklist, &Subtarget,
&Inserted](const NodeExtensionHelper &Op) {
if (Op.needToPromoteOtherUsers()) {
- for (SDNode *TheUse : Op.OrigOperand->uses()) {
+ for (SDNode::use_iterator UI = Op.OrigOperand->use_begin(),
+ UE = Op.OrigOperand->use_end();
+ UI != UE; ++UI) {
+ SDNode *TheUse = *UI;
+ if (!NodeExtensionHelper::isSupportedRoot(TheUse, Subtarget))
+ return false;
+ // We only support the first 2 operands of FMA.
+ if (UI.getOperandNo() >= 2)
+ return false;
if (Inserted.insert(TheUse).second)
Worklist.push_back(TheUse);
}
}
+ return true;
};
// Control the compile time by limiting the number of node we look at in
@@ -14904,9 +14932,11 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
// we would be leaving the old input (since it may still be used),
// and the new one.
if (Res->LHSExt.has_value())
- AppendUsersIfNeeded(LHS);
+ if (!AppendUsersIfNeeded(LHS))
+ return SDValue();
if (Res->RHSExt.has_value())
- AppendUsersIfNeeded(RHS);
+ if (!AppendUsersIfNeeded(RHS))
+ return SDValue();
break;
}
}
@@ -14993,7 +15023,7 @@ static SDValue performVWADDSUBW_VLCombine(SDNode *N,
assert(Opc == RISCVISD::VWADD_W_VL || Opc == RISCVISD::VWADDU_W_VL ||
Opc == RISCVISD::VWSUB_W_VL || Opc == RISCVISD::VWSUBU_W_VL);
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return combineVWADDSUBWSelect(N, DCI.DAG);
@@ -15408,8 +15438,11 @@ static SDValue combineVFMADD_VLWithVFNEG_VL(SDNode *N, SelectionDAG &DAG) {
VL);
}
-static SDValue performVFMADD_VLCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performVFMADD_VLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
const RISCVSubtarget &Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+
if (SDValue V = combineVFMADD_VLWithVFNEG_VL(N, DAG))
return V;
@@ -15421,50 +15454,7 @@ static SDValue performVFMADD_VLCombine(SDNode *N, SelectionDAG &DAG,
if (N->isTargetStrictFPOpcode())
return SDValue();
- // Try to form widening FMA.
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- SDValue Mask = N->getOperand(3);
- SDValue VL = N->getOperand(4);
-
- if (Op0.getOpcode() != RISCVISD::FP_EXTEND_VL ||
- Op1.getOpcode() != RISCVISD::FP_EXTEND_VL)
- return SDValue();
-
- // TODO: Refactor to handle more complex cases similar to
- // combineBinOp_VLToVWBinOp_VL.
- if ((!Op0.hasOneUse() || !Op1.hasOneUse()) &&
- (Op0 != Op1 || !Op0->hasNUsesOfValue(2, 0)))
- return SDValue();
-
- // Check the mask and VL are the same.
- if (Op0.getOperand(1) != Mask || Op0.getOperand(2) != VL ||
- Op1.getOperand(1) != Mask || Op1.getOperand(2) != VL)
- return SDValue();
-
- unsigned NewOpc;
- switch (N->getOpcode()) {
- default:
- llvm_unreachable("Unexpected opcode");
- case RISCVISD::VFMADD_VL:
- NewOpc = RISCVISD::VFWMADD_VL;
- break;
- case RISCVISD::VFNMSUB_VL:
- NewOpc = RISCVISD::VFWNMSUB_VL;
- break;
- case RISCVISD::VFNMADD_VL:
- NewOpc = RISCVISD::VFWNMADD_VL;
- break;
- case RISCVISD::VFMSUB_VL:
- NewOpc = RISCVISD::VFWMSUB_VL;
- break;
- }
-
- Op0 = Op0.getOperand(0);
- Op1 = Op1.getOperand(0);
-
- return DAG.getNode(NewOpc, SDLoc(N), N->getValueType(0), Op0, Op1,
- N->getOperand(2), Mask, VL);
+ return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
}
static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
@@ -16661,28 +16651,28 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case ISD::ADD: {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
if (SDValue V = combineToVWMACC(N, DAG, Subtarget))
return V;
return performADDCombine(N, DCI, Subtarget);
}
case ISD::SUB: {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return performSUBCombine(N, DAG, Subtarget);
}
case ISD::AND:
return performANDCombine(N, DCI, Subtarget);
case ISD::OR: {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return performORCombine(N, DCI, Subtarget);
}
case ISD::XOR:
return performXORCombine(N, DAG, Subtarget);
case ISD::MUL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return performMULCombine(N, DAG, DCI, Subtarget);
case ISD::SDIV:
@@ -17107,7 +17097,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::SHL_VL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
[[fallthrough]];
case RISCVISD::SRA_VL:
@@ -17132,7 +17122,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRL:
case ISD::SHL: {
if (N->getOpcode() == ISD::SHL) {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
}
SDValue ShAmt = N->getOperand(1);
@@ -17148,7 +17138,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::ADD_VL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return combineToVWMACC(N, DAG, Subtarget);
case RISCVISD::VWADD_W_VL:
@@ -17158,7 +17148,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return performVWADDSUBW_VLCombine(N, DCI, Subtarget);
case RISCVISD::SUB_VL:
case RISCVISD::MUL_VL:
- return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget);
+ return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
case RISCVISD::VFMADD_VL:
case RISCVISD::VFNMADD_VL:
case RISCVISD::VFMSUB_VL:
@@ -17167,7 +17157,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case RISCVISD::STRICT_VFNMADD_VL:
case RISCVISD::STRICT_VFMSUB_VL:
case RISCVISD::STRICT_VFNMSUB_VL:
- return performVFMADD_VLCombine(N, DAG, Subtarget);
+ return performVFMADD_VLCombine(N, DCI, Subtarget);
case RISCVISD::FADD_VL:
case RISCVISD::FSUB_VL:
case RISCVISD::FMUL_VL:
@@ -17176,7 +17166,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (N->getValueType(0).getVectorElementType() == MVT::f32 &&
!Subtarget.hasVInstructionsF16())
return SDValue();
- return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget);
+ return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
}
case ISD::LOAD:
case ISD::STORE: {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 1df51c1..0620c3f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1365,7 +1365,7 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg,
return nullptr;
}
bool DontMoveAcrossStores = true;
- if (!MI->isSafeToMove(/* AliasAnalysis = */ nullptr, DontMoveAcrossStores))
+ if (!MI->isSafeToMove(DontMoveAcrossStores))
return nullptr;
return MI;
}
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 6eed2ae..1729bc0 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -83,9 +83,11 @@ def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64",
def ROCKET : RISCVTuneProcessorModel<"rocket",
RocketModel>;
+defvar SiFive7TuneFeatures = [TuneSiFive7, TuneNoDefaultUnroll,
+ TuneShortForwardBranchOpt,
+ FeaturePostRAScheduler];
def SIFIVE_7 : RISCVTuneProcessorModel<"sifive-7-series",
- SiFive7Model,
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7Model, SiFive7TuneFeatures>;
def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20",
RocketModel,
@@ -145,7 +147,7 @@ def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76",
FeatureStdExtA,
FeatureStdExtF,
FeatureStdExtC],
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7TuneFeatures>;
def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21",
RocketModel,
@@ -189,7 +191,7 @@ def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76",
FeatureStdExtD,
FeatureStdExtC,
FeatureStdExtZihintpause],
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7TuneFeatures>;
def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54",
RocketModel,
@@ -212,8 +214,11 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74",
FeatureStdExtF,
FeatureStdExtD,
FeatureStdExtC],
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7TuneFeatures>;
+defvar SiFiveX280TuneFeatures = !listconcat(SiFive7TuneFeatures,
+ [TuneDLenFactor2,
+ TuneOptimizedZeroStrideLoad]);
def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
[Feature64Bit,
FeatureStdExtI,
@@ -229,10 +234,7 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
FeatureStdExtZvfh,
FeatureStdExtZba,
FeatureStdExtZbb],
- [TuneSiFive7,
- FeaturePostRAScheduler,
- TuneDLenFactor2,
- TuneOptimizedZeroStrideLoad]>;
+ SiFiveX280TuneFeatures>;
def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
[Feature64Bit,
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index f328c55..20c014a 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -56,7 +56,9 @@ public:
MachineFunctionProperties::Property::IsSSA);
}
- StringRef getPassName() const override { return "RISC-V Fold Masks"; }
+ StringRef getPassName() const override {
+ return "RISC-V Vector Peephole Optimization";
+ }
private:
bool convertToVLMAX(MachineInstr &MI) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index b2b8814..3833939 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6653,7 +6653,8 @@ SDValue SystemZTargetLowering::combineTruncateExtract(
// Defer the creation of the bitcast from X to combineExtract,
// which might be able to optimize the extraction.
- VecVT = MVT::getVectorVT(MVT::getIntegerVT(TruncBytes * 8),
+ VecVT = EVT::getVectorVT(*DCI.DAG.getContext(),
+ MVT::getIntegerVT(TruncBytes * 8),
VecVT.getStoreSize() / TruncBytes);
EVT ResVT = (TruncBytes < 4 ? MVT::i32 : TruncVT);
return combineExtract(DL, ResVT, VecVT, Vec, NewIndex, DCI, true);
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 16bbfd4..d075889 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -631,8 +631,7 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
bool SawStore = false;
- if (!DefMI->isSafeToMove(nullptr, SawStore) ||
- !MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
+ if (!DefMI->isSafeToMove(SawStore) || !MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
return nullptr;
int UseOpIdx =
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 0c2c6bf..957eb21 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -28,6 +28,7 @@
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
@@ -975,6 +976,33 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
}
}
+/// True if this module is being built for windows/msvc, and uses floating
+/// point. This is used to emit an undefined reference to _fltused. This is
+/// needed in Windows kernel or driver contexts to find and prevent code from
+/// modifying non-GPR registers.
+///
+/// TODO: It would be better if this was computed from MIR by looking for
+/// selected floating-point instructions.
+static bool usesMSVCFloatingPoint(const Triple &TT, const Module &M) {
+ // Only needed for MSVC
+ if (!TT.isWindowsMSVCEnvironment())
+ return false;
+
+ for (const Function &F : M) {
+ for (const Instruction &I : instructions(F)) {
+ if (I.getType()->isFPOrFPVectorTy())
+ return true;
+
+ for (const auto &Op : I.operands()) {
+ if (Op->getType()->isFPOrFPVectorTy())
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
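+
The module scan above replaces the removed MachineModuleInfo::usesMSVCFloatingPoint() flag: instead of tracking FP use during instruction selection, the asm printer now walks every instruction's result and operand types at the end of the file. A schematic standalone model of that scan, with plain structs standing in for LLVM's Function/Instruction (not the LLVM API):

    #include <cassert>
    #include <vector>

    // Hypothetical stand-ins for IR instructions, for illustration only.
    struct InstModel {
      bool ResultIsFP = false;
      std::vector<bool> OperandIsFP;
    };

    bool usesFloatingPoint(const std::vector<InstModel> &Insts) {
      for (const InstModel &I : Insts) {
        if (I.ResultIsFP)
          return true; // result type is FP or an FP vector
        for (bool OpFP : I.OperandIsFP)
          if (OpFP)
            return true; // an operand type is FP or an FP vector
      }
      return false;
    }

    int main() {
      assert(!usesFloatingPoint({{false, {false}}}));
      assert(usesFloatingPoint({{false, {false, true}}})); // FP operand found
    }
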
void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
@@ -993,7 +1021,7 @@ void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
// safe to set.
OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
} else if (TT.isOSBinFormatCOFF()) {
- if (MMI->usesMSVCFloatingPoint()) {
+ if (usesMSVCFloatingPoint(TT, M)) {
// In Windows' libcmt.lib, there is a file which is linked in only if the
// symbol _fltused is referenced. Linking this in causes some
// side-effects:
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index f3e6aaf..918a608 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5588,7 +5588,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
bool SawStore = false;
- if (!DefMI->isSafeToMove(nullptr, SawStore))
+ if (!DefMI->isSafeToMove(SawStore))
return nullptr;
// Collect information about virtual register operands of MI.
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 63ac910..697d30a 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -792,10 +792,10 @@ def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> {
def: InstRW<[BWWriteResGroup28], (instrs VPBROADCASTBrr,
VPBROADCASTWrr)>;
-def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> {
+def BWWriteResGroup33 : SchedWriteRes<[BWPort5]> {
let Latency = 3;
- let NumMicroOps = 3;
- let ReleaseAtCycles = [2,1];
+ let NumMicroOps = 2;
+ let ReleaseAtCycles = [2];
}
def: InstRW<[BWWriteResGroup33], (instrs MMX_PACKSSDWrr,
MMX_PACKSSWBrr,
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 516dc62..c4d2ad7 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -1247,10 +1247,10 @@ def: InstRW<[HWWriteResGroup53_1], (instrs VPMOVSXBWYrm,
VPMOVSXWDYrm,
VPMOVZXWDYrm)>;
-def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> {
+def HWWriteResGroup57 : SchedWriteRes<[HWPort5]> {
let Latency = 3;
- let NumMicroOps = 3;
- let ReleaseAtCycles = [2,1];
+ let NumMicroOps = 2;
+ let ReleaseAtCycles = [2];
}
def: InstRW<[HWWriteResGroup57], (instrs MMX_PACKSSDWrr,
MMX_PACKSSWBrr,
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 7b33aed..6966400 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -638,7 +638,8 @@ def: InstRW<[SBWriteResGroup5], (instrs MMX_PABSBrr,
MMX_PALIGNRrri,
MMX_PSIGNBrr,
MMX_PSIGNDrr,
- MMX_PSIGNWrr)>;
+ MMX_PSIGNWrr,
+ MMX_PSUBQrr)>;
def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> {
let Latency = 2;
@@ -898,7 +899,8 @@ def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> {
let NumMicroOps = 2;
let ReleaseAtCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup59], (instrs MMX_PADDQrm)>;
+def: InstRW<[SBWriteResGroup59], (instrs MMX_PADDQrm,
+ MMX_PSUBQrm)>;
def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> {
let Latency = 7;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 3ee931f..d764cb3 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -404,7 +404,7 @@ defm : SKLWriteResPair<WritePSADBW, [SKLPort5], 3, [1], 1, 5>; // Vector PSADBW
defm : SKLWriteResPair<WritePSADBWX, [SKLPort5], 3, [1], 1, 6>;
defm : SKLWriteResPair<WritePSADBWY, [SKLPort5], 3, [1], 1, 7>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
-defm : SKLWriteResPair<WritePHMINPOS, [SKLPort01], 4, [1], 1, 6>; // Vector PHMINPOS.
+defm : SKLWriteResPair<WritePHMINPOS, [SKLPort0], 4, [1], 1, 6>; // Vector PHMINPOS.
// Vector integer shifts.
defm : SKLWriteResPair<WriteVecShift, [SKLPort0], 1, [1], 1, 5>;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 161fe13..f710456 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3596,6 +3596,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
{ ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
{ ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
+ { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
+ { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
+ { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
+ { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
{ ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
{ ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
{ ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
@@ -3674,8 +3678,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
{ ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
{ ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
+ { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
+ { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
{ ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
- { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8} },
+ { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
{ ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
{ ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
{ ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
@@ -3694,6 +3700,8 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
{ ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
{ ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
+ { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104 } },
+ { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
{ ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
{ ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
{ ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
@@ -3828,6 +3836,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
{ ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
{ ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
+ { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
+ { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
+ { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
+ { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
+ { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
+ { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
+ { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
+ { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
{ ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
{ ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
{ ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
@@ -3849,6 +3865,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
{ ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
{ ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
+ { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
+ { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
+ { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
+ { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
+ { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
+ { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
+ { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
+ { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
{ ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
{ ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
{ ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
@@ -3925,6 +3949,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
{ ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
{ ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
+ { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
+ { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
+ { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
+ { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
+ { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
+ { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
+ { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
{ ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
{ ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
{ ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
@@ -3945,6 +3977,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
{ ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
{ ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
+ { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
+ { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
+ { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
+ { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
+ { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
+ { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
+ { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
{ ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
{ ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
@@ -4020,6 +4060,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
{ ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
{ ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
+ { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
+ { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
+ { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
+ { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
{ ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
{ ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
{ ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
@@ -4030,6 +4074,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
{ ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
{ ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
+ { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
+ { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
+ { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
+ { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
};
static const CostKindTblEntry SSSE3CostTbl[] = {
{ ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
@@ -4091,6 +4139,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
{ ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
{ ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
+ { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
+ { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
+ { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
+ { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
{ ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
{ ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
{ ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
@@ -4107,6 +4159,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
{ ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
{ ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
+ { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
+ { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
+ { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
+ { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
{ ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
{ ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
{ ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
index b382fed..832506f 100644
--- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
@@ -1725,6 +1725,9 @@ Value *NumericalStabilitySanitizer::createShadowValueWithOperandsAvailable(
Map.getShadow(S->getTrueValue()),
Map.getShadow(S->getFalseValue()));
+ if (auto *Freeze = dyn_cast<FreezeInst>(&Inst))
+ return Builder.CreateFreeze(Map.getShadow(Freeze->getOperand(0)));
+
if (auto *Extract = dyn_cast<ExtractElementInst>(&Inst))
return Builder.CreateExtractElement(
Map.getShadow(Extract->getVectorOperand()), Extract->getIndexOperand());
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 91ef2b4..329b3ef 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -113,6 +113,8 @@ STATISTIC(NumFPAssociationsHoisted, "Number of invariant FP expressions "
STATISTIC(NumIntAssociationsHoisted,
"Number of invariant int expressions "
"reassociated and hoisted out of the loop");
+STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
+ "reassociated and hoisted out of the loop");
/// Memory promotion is enabled by default.
static cl::opt<bool>
@@ -2779,6 +2781,68 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
return true;
}
+/// Reassociate associative binary expressions of the form
+///
+/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+///
+/// where op is an associative binary op, LV is a loop variant, and C1 and C2
+/// are loop invariants that we want to hoist.
+///
+/// TODO: This can be extended to more cases such as
+/// 2. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV"
+/// 3. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is commutative
+/// 4. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is commutative
+static bool hoistBOAssociation(Instruction &I, Loop &L,
+ ICFLoopSafetyInfo &SafetyInfo,
+ MemorySSAUpdater &MSSAU, AssumptionCache *AC,
+ DominatorTree *DT) {
+ auto *BO = dyn_cast<BinaryOperator>(&I);
+ if (!BO || !BO->isAssociative())
+ return false;
+
+ // Only fold ADDs for now.
+ Instruction::BinaryOps Opcode = BO->getOpcode();
+ if (Opcode != Instruction::Add)
+ return false;
+
+ auto *BO0 = dyn_cast<BinaryOperator>(BO->getOperand(0));
+ if (!BO0 || BO0->getOpcode() != Opcode || !BO0->isAssociative())
+ return false;
+
+ // Transform: "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+ Value *LV = BO0->getOperand(0);
+ Value *C1 = BO0->getOperand(1);
+ Value *C2 = BO->getOperand(1);
+
+ if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) || !L.isLoopInvariant(C2))
+ return false;
+
+ auto *Preheader = L.getLoopPreheader();
+ assert(Preheader && "Loop is not in simplify form?");
+
+ auto *Inv = BinaryOperator::Create(Opcode, C1, C2, "invariant.op",
+ Preheader->getTerminator());
+ auto *NewBO =
+ BinaryOperator::Create(Opcode, LV, Inv, BO->getName() + ".reass", BO);
+
+ // Copy NUW for ADDs if both instructions have it.
+ if (Opcode == Instruction::Add && BO->hasNoUnsignedWrap() &&
+ BO0->hasNoUnsignedWrap()) {
+ Inv->setHasNoUnsignedWrap(true);
+ NewBO->setHasNoUnsignedWrap(true);
+ }
+
+ BO->replaceAllUsesWith(NewBO);
+ eraseInstruction(*BO, SafetyInfo, MSSAU);
+
+ // (LV op C1) might not be erased if it has more uses than the one we just
+ // replaced.
+ if (BO0->use_empty())
+ eraseInstruction(*BO0, SafetyInfo, MSSAU);
+
+ return true;
+}
+
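+
The transformation in hoistBOAssociation is profitable because it moves one of the two additions out of the loop: "(LV op C1) op C2" becomes "LV op (C1 op C2)", and "C1 op C2" is computed once in the preheader. Note that nuw is reapplied only when both original adds carried it, since reassociation could otherwise introduce an intermediate wrap. A scalar sketch of the before/after shapes (plain C++, illustrative names):

    #include <cassert>

    int sumBefore(const int *X, int N, int C1, int C2) {
      int S = 0;
      for (int I = 0; I < N; ++I)
        S += (X[I] + C1) + C2; // two invariant adds per iteration
      return S;
    }

    int sumAfter(const int *X, int N, int C1, int C2) {
      int Inv = C1 + C2; // "invariant.op", hoisted to the preheader
      int S = 0;
      for (int I = 0; I < N; ++I)
        S += X[I] + Inv; // one invariant add per iteration
      return S;
    }

    int main() {
      int X[] = {1, 2, 3, 4};
      assert(sumBefore(X, 4, 5, 7) == sumAfter(X, 4, 5, 7));
    }
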
static bool hoistArithmetics(Instruction &I, Loop &L,
ICFLoopSafetyInfo &SafetyInfo,
MemorySSAUpdater &MSSAU, AssumptionCache *AC,
@@ -2816,6 +2880,12 @@ static bool hoistArithmetics(Instruction &I, Loop &L,
return true;
}
+ if (hoistBOAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) {
+ ++NumHoisted;
+ ++NumBOAssociationsHoisted;
+ return true;
+ }
+
return false;
}
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index b38db41..ee0d95b 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -347,6 +347,30 @@ void llvm::createMemCpyLoopUnknownSize(
}
}
+// If \p Addr1 and \p Addr2 are pointers to different address spaces, create
+// an addrspacecast to obtain a pair of pointers in the same address space.
+// The caller needs to ensure that addrspacecasting is possible.
+// No-op if the pointers are in the same address space.
+static std::pair<Value *, Value *>
+tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
+ const TargetTransformInfo &TTI) {
+ Value *ResAddr1 = Addr1;
+ Value *ResAddr2 = Addr2;
+
+ unsigned AS1 = cast<PointerType>(Addr1->getType())->getAddressSpace();
+ unsigned AS2 = cast<PointerType>(Addr2->getType())->getAddressSpace();
+ if (AS1 != AS2) {
+ if (TTI.isValidAddrSpaceCast(AS2, AS1))
+ ResAddr2 = B.CreateAddrSpaceCast(Addr2, Addr1->getType());
+ else if (TTI.isValidAddrSpaceCast(AS1, AS2))
+ ResAddr1 = B.CreateAddrSpaceCast(Addr1, Addr2->getType());
+ else
+ llvm_unreachable("Can only lower memmove between address spaces if they "
+ "support addrspacecast");
+ }
+ return {ResAddr1, ResAddr2};
+}
+
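+
The helper tries both cast directions: cast Addr2 into Addr1's address space if that is valid, otherwise cast Addr1 into Addr2's; if neither cast is valid, the memmove lowering cannot proceed (the real code asserts unreachable). A standalone sketch of that direction-picking logic, with bool flags standing in for TTI.isValidAddrSpaceCast:

    #include <cassert>
    #include <optional>
    #include <utility>

    // Illustrative model: pick which pointer to cast so both end up in one
    // address space. Returns the resulting (AS of Addr1, AS of Addr2).
    std::optional<std::pair<unsigned, unsigned>>
    commonAddrSpace(unsigned AS1, unsigned AS2, bool CanCast2To1,
                    bool CanCast1To2) {
      if (AS1 == AS2)
        return std::pair{AS1, AS2}; // no-op
      if (CanCast2To1)
        return std::pair{AS1, AS1}; // cast Addr2 into Addr1's space
      if (CanCast1To2)
        return std::pair{AS2, AS2}; // cast Addr1 into Addr2's space
      return std::nullopt;          // unreachable in the real lowering
    }

    int main() {
      assert(commonAddrSpace(0, 5, true, false)->second == 0);
      assert(commonAddrSpace(0, 5, false, true)->first == 5);
    }
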
// Lower memmove to IR. memmove is required to correctly copy overlapping memory
// regions; therefore, it has to check the relative positions of the source and
// destination pointers and choose the copy direction accordingly.
@@ -369,17 +393,61 @@ void llvm::createMemCpyLoopUnknownSize(
// }
// return dst;
// }
-static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
- Value *DstAddr, Value *CopyLen, Align SrcAlign,
- Align DstAlign, bool SrcIsVolatile,
- bool DstIsVolatile,
- const TargetTransformInfo &TTI) {
+//
+// If the TargetTransformInfo specifies a wider MemcpyLoopLoweringType, it is
+// used for the memory accesses in the loops. Then, additional loops with
+// byte-wise accesses are added for the remaining bytes.
+static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
const DataLayout &DL = F->getDataLayout();
- // TODO: Use different element type if possible?
- Type *EltTy = Type::getInt8Ty(F->getContext());
+ LLVMContext &Ctx = OrigBB->getContext();
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ bool LoopOpIsInt8 = LoopOpType == Int8Type;
+
+ // If the memory accesses are wider than one byte, residual loops with
+ // i8-accesses are required to move remaining bytes.
+ bool RequiresResidual = !LoopOpIsInt8;
+
+ Type *ResidualLoopOpType = Int8Type;
+ unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
+
+ // Calculate the loop trip count and remaining bytes to copy after the loop.
+ IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
+ ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
+ ConstantInt *One = ConstantInt::get(ILengthType, 1);
+
+ IRBuilder<> PLBuilder(InsertBefore);
+
+ Value *RuntimeLoopCount = CopyLen;
+ Value *RuntimeLoopRemainder = nullptr;
+ Value *RuntimeBytesCopiedMainLoop = CopyLen;
+ Value *SkipResidualCondition = nullptr;
+ if (RequiresResidual) {
+ RuntimeLoopCount =
+ getRuntimeLoopCount(DL, PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
+ RuntimeLoopRemainder = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
+ CILoopOpSize, LoopOpSize);
+ RuntimeBytesCopiedMainLoop =
+ PLBuilder.CreateSub(CopyLen, RuntimeLoopRemainder);
+ SkipResidualCondition =
+ PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual");
+ }
+ Value *SkipMainCondition =
+ PLBuilder.CreateICmpEQ(RuntimeLoopCount, Zero, "skip_main");
// Create a comparison of src and dst, based on which we jump to either
// the forward-copy part of the function (if src >= dst) or the backwards-copy
@@ -387,76 +455,374 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
// SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
// structure. Its block terminators (unconditional branches) are replaced by
// the appropriate conditional branches when the loop is built.
- ICmpInst *PtrCompare = new ICmpInst(InsertBefore->getIterator(), ICmpInst::ICMP_ULT,
- SrcAddr, DstAddr, "compare_src_dst");
+ // If the pointers are in different address spaces, they need to be converted
+ // to a compatible one. Cases where memory ranges in the different address
+ // spaces cannot overlap are lowered as memcpy and not handled here.
+ auto [CmpSrcAddr, CmpDstAddr] =
+ tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
+ Value *PtrCompare =
+ PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
Instruction *ThenTerm, *ElseTerm;
- SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(), &ThenTerm,
- &ElseTerm);
-
- // Each part of the function consists of two blocks:
- // copy_backwards: used to skip the loop when n == 0
- // copy_backwards_loop: the actual backwards loop BB
- // copy_forward: used to skip the loop when n == 0
- // copy_forward_loop: the actual forward loop BB
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
+ &ThenTerm, &ElseTerm);
+
+ // If the LoopOpSize is greater than 1, each part of the function consists of
+ // four blocks:
+ // memmove_copy_backwards:
+ // skip the residual loop when 0 iterations are required
+ // memmove_bwd_residual_loop:
+ // copy the last few bytes individually so that the remaining length is
+ // a multiple of the LoopOpSize
+ // memmove_bwd_middle: skip the main loop when 0 iterations are required
+ // memmove_bwd_main_loop: the actual backwards loop BB with wide accesses
+ // memmove_copy_forward: skip the main loop when 0 iterations are required
+ // memmove_fwd_main_loop: the actual forward loop BB with wide accesses
+ // memmove_fwd_middle: skip the residual loop when 0 iterations are required
+ // memmove_fwd_residual_loop: copy the last few bytes individually
+ //
+ // The main and residual loop are switched between copying forward and
+ // backward so that the residual loop always operates on the end of the moved
+ // range. This is based on the assumption that buffers whose start is aligned
+ // with the LoopOpSize are more common than buffers whose end is.
+ //
+ // If the LoopOpSize is 1, each part of the function consists of two blocks:
+ // memmove_copy_backwards: skip the loop when 0 iterations are required
+ // memmove_bwd_main_loop: the actual backwards loop BB
+ // memmove_copy_forward: skip the loop when 0 iterations are required
+ // memmove_fwd_main_loop: the actual forward loop BB
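+
A compact standalone model of the control structure described in the comment above, assuming a 4-byte loop operation type: a byte-granular residual loop plus a wide main loop, run residual-first when copying backwards and residual-last when copying forwards, so the residual always handles the tail of the moved range. This mirrors the behavior of the emitted CFG, not LLVM's IR-building code:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    void memmoveModel(uint8_t *Dst, const uint8_t *Src, size_t Len) {
      constexpr size_t LoopOpSize = 4; // stand-in for the TTI lowering type
      const size_t Count = Len / LoopOpSize;       // main-loop trip count
      const size_t Rem = Len - Count * LoopOpSize; // residual byte count
      uint8_t Tmp[LoopOpSize]; // models the loaded "element" register
      if (Src < Dst) {
        // Backwards: byte residual first (tail of the range), then wide loop.
        for (size_t I = Len; I > Len - Rem; --I)
          Dst[I - 1] = Src[I - 1];
        for (size_t I = Count; I > 0; --I) {
          std::memcpy(Tmp, Src + (I - 1) * LoopOpSize, LoopOpSize); // load
          std::memcpy(Dst + (I - 1) * LoopOpSize, Tmp, LoopOpSize); // store
        }
      } else {
        // Forwards: wide loop first, then byte residual (tail of the range).
        for (size_t I = 0; I < Count; ++I) {
          std::memcpy(Tmp, Src + I * LoopOpSize, LoopOpSize); // load
          std::memcpy(Dst + I * LoopOpSize, Tmp, LoopOpSize); // store
        }
        for (size_t I = Count * LoopOpSize; I < Len; ++I)
          Dst[I] = Src[I];
      }
    }

    int main() {
      uint8_t Buf[16];
      for (int I = 0; I < 16; ++I)
        Buf[I] = static_cast<uint8_t>(I);
      memmoveModel(Buf + 2, Buf, 11); // overlapping move, Src < Dst
      for (int I = 0; I < 11; ++I)
        assert(Buf[I + 2] == I);
    }
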
BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
- CopyBackwardsBB->setName("copy_backwards");
+ CopyBackwardsBB->setName("memmove_copy_backwards");
BasicBlock *CopyForwardBB = ElseTerm->getParent();
- CopyForwardBB->setName("copy_forward");
+ CopyForwardBB->setName("memmove_copy_forward");
BasicBlock *ExitBB = InsertBefore->getParent();
ExitBB->setName("memmove_done");
- unsigned PartSize = DL.getTypeStoreSize(EltTy);
- Align PartSrcAlign(commonAlignment(SrcAlign, PartSize));
- Align PartDstAlign(commonAlignment(DstAlign, PartSize));
+ Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
- // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
- // between both backwards and forward copy clauses.
- ICmpInst *CompareN =
- new ICmpInst(OrigBB->getTerminator()->getIterator(), ICmpInst::ICMP_EQ, CopyLen,
- ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+ // Accesses in the residual loops do not share the same alignment as those in
+ // the main loops.
+ Align ResidualSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
+ Align ResidualDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
// Copying backwards.
- BasicBlock *LoopBB =
- BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
- IRBuilder<> LoopBuilder(LoopBB);
+ {
+ BasicBlock *MainLoopBB = BasicBlock::Create(
+ F->getContext(), "memmove_bwd_main_loop", F, CopyForwardBB);
+
+ // The predecessor of the memmove_bwd_main_loop. Updated in the
+ // following if a residual loop is emitted first.
+ BasicBlock *PredBB = CopyBackwardsBB;
+
+ if (RequiresResidual) {
+ // backwards residual loop
+ BasicBlock *ResidualLoopBB = BasicBlock::Create(
+ F->getContext(), "memmove_bwd_residual_loop", F, MainLoopBB);
+ IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+ PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
+ Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
+ ResidualLoopPhi, One, "bwd_residual_index");
+ Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+ ResidualLoopOpType, SrcAddr, ResidualIndex);
+ Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
+ ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
+ "element");
+ Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+ ResidualLoopOpType, DstAddr, ResidualIndex);
+ ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
+ ResidualDstAlign, DstIsVolatile);
+
+ // After the residual loop, go to an intermediate block.
+ BasicBlock *IntermediateBB = BasicBlock::Create(
+ F->getContext(), "memmove_bwd_middle", F, MainLoopBB);
+ // Later code expects a terminator in the PredBB.
+ IRBuilder<> IntermediateBuilder(IntermediateBB);
+ IntermediateBuilder.CreateUnreachable();
+ ResidualLoopBuilder.CreateCondBr(
+ ResidualLoopBuilder.CreateICmpEQ(ResidualIndex,
+ RuntimeBytesCopiedMainLoop),
+ IntermediateBB, ResidualLoopBB);
+
+ ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
+ ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+
+ // How to get to the residual:
+ BranchInst::Create(IntermediateBB, ResidualLoopBB, SkipResidualCondition,
+ ThenTerm->getIterator());
+ ThenTerm->eraseFromParent();
+
+ PredBB = IntermediateBB;
+ }
- PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
- Value *IndexPtr = LoopBuilder.CreateSub(
- LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
- Value *Element = LoopBuilder.CreateAlignedLoad(
- EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
- PartSrcAlign, SrcIsVolatile, "element");
- LoopBuilder.CreateAlignedStore(
- Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
- PartDstAlign, DstIsVolatile);
- LoopBuilder.CreateCondBr(
- LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
- ExitBB, LoopBB);
- LoopPhi->addIncoming(IndexPtr, LoopBB);
- LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
- BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm->getIterator());
- ThenTerm->eraseFromParent();
+ // main loop
+ IRBuilder<> MainLoopBuilder(MainLoopBB);
+ PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
+ Value *MainIndex =
+ MainLoopBuilder.CreateSub(MainLoopPhi, One, "bwd_main_index");
+ Value *LoadGEP =
+ MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainIndex);
+ Value *Element = MainLoopBuilder.CreateAlignedLoad(
+ LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+ Value *StoreGEP =
+ MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainIndex);
+ MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+ DstIsVolatile);
+ MainLoopBuilder.CreateCondBr(MainLoopBuilder.CreateICmpEQ(MainIndex, Zero),
+ ExitBB, MainLoopBB);
+ MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
+ MainLoopPhi->addIncoming(RuntimeLoopCount, PredBB);
+
+ // How to get to the main loop:
+ Instruction *PredBBTerm = PredBB->getTerminator();
+ BranchInst::Create(ExitBB, MainLoopBB, SkipMainCondition,
+ PredBBTerm->getIterator());
+ PredBBTerm->eraseFromParent();
+ }
// Copying forward.
- BasicBlock *FwdLoopBB =
- BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
- IRBuilder<> FwdLoopBuilder(FwdLoopBB);
- PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
- Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
- Value *FwdElement = FwdLoopBuilder.CreateAlignedLoad(
- EltTy, SrcGEP, PartSrcAlign, SrcIsVolatile, "element");
- Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
- FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign,
- DstIsVolatile);
- Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
- FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
- FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
- ExitBB, FwdLoopBB);
- FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
- FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
-
- BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm->getIterator());
- ElseTerm->eraseFromParent();
+ // main loop
+ {
+ BasicBlock *MainLoopBB =
+ BasicBlock::Create(F->getContext(), "memmove_fwd_main_loop", F, ExitBB);
+ IRBuilder<> MainLoopBuilder(MainLoopBB);
+ PHINode *MainLoopPhi =
+ MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
+ Value *LoadGEP =
+ MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainLoopPhi);
+ Value *Element = MainLoopBuilder.CreateAlignedLoad(
+ LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+ Value *StoreGEP =
+ MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainLoopPhi);
+ MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+ DstIsVolatile);
+ Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, One);
+ MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
+ MainLoopPhi->addIncoming(Zero, CopyForwardBB);
+
+ Instruction *CopyFwdBBTerm = CopyForwardBB->getTerminator();
+ BasicBlock *SuccessorBB = ExitBB;
+ if (RequiresResidual)
+ SuccessorBB =
+ BasicBlock::Create(F->getContext(), "memmove_fwd_middle", F, ExitBB);
+
+ // leaving or staying in the main loop
+ MainLoopBuilder.CreateCondBr(
+ MainLoopBuilder.CreateICmpEQ(MainIndex, RuntimeLoopCount), SuccessorBB,
+ MainLoopBB);
+
+ // getting in or skipping the main loop
+ BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
+ CopyFwdBBTerm->getIterator());
+ CopyFwdBBTerm->eraseFromParent();
+
+ if (RequiresResidual) {
+ BasicBlock *IntermediateBB = SuccessorBB;
+ IRBuilder<> IntermediateBuilder(IntermediateBB);
+ BasicBlock *ResidualLoopBB = BasicBlock::Create(
+ F->getContext(), "memmove_fwd_residual_loop", F, ExitBB);
+ IntermediateBuilder.CreateCondBr(SkipResidualCondition, ExitBB,
+ ResidualLoopBB);
+
+ // Residual loop
+ IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+ PHINode *ResidualLoopPhi =
+ ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
+ Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+ ResidualLoopOpType, SrcAddr, ResidualLoopPhi);
+ Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
+ ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
+ "element");
+ Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+ ResidualLoopOpType, DstAddr, ResidualLoopPhi);
+ ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
+ ResidualDstAlign, DstIsVolatile);
+ Value *ResidualIndex =
+ ResidualLoopBuilder.CreateAdd(ResidualLoopPhi, One);
+ ResidualLoopBuilder.CreateCondBr(
+ ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, CopyLen), ExitBB,
+ ResidualLoopBB);
+ ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
+ ResidualLoopPhi->addIncoming(RuntimeBytesCopiedMainLoop, IntermediateBB);
+ }
+ }
+}
+
+// Similar to createMemMoveLoopUnknownSize, only the trip counts are computed at
+// compile time, obsolete loops and branches are omitted, and the residual code
+// is straight-line code instead of a loop.
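+
With a constant length the divisions fold away at compile time; for example, with CopyLen = 30 and an 8-byte LoopOpType, LoopEndCount = 3, BytesCopiedInLoop = 24, and RemainingBytes = 6, and the residual is emitted as straight-line loads and stores (types chosen by getMemcpyLoopResidualLoweringType in the real code). A tiny checked sketch of that arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t CopyLen = 30, LoopOpSize = 8;
      uint64_t LoopEndCount = CopyLen / LoopOpSize;           // 3 wide iterations
      uint64_t BytesCopiedInLoop = LoopEndCount * LoopOpSize; // 24 bytes
      uint64_t RemainingBytes = CopyLen - BytesCopiedInLoop;  // 6 residual bytes
      assert(LoopEndCount == 3 && BytesCopiedInLoop == 24 && RemainingBytes == 6);
      // The 6 residual bytes could be covered by, e.g., one 4-byte and one
      // 2-byte access (illustrative; the real choice comes from the TTI hook).
    }
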
+static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ ConstantInt *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+ // No need to expand zero length moves.
+ if (CopyLen->isZero())
+ return;
+
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ const DataLayout &DL = F->getDataLayout();
+ LLVMContext &Ctx = OrigBB->getContext();
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+ // Calculate the loop trip count and remaining bytes to copy after the loop.
+ uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
+ uint64_t BytesCopiedInLoop = LoopEndCount * LoopOpSize;
+ uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;
+
+ IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
+ ConstantInt *One = ConstantInt::get(ILengthType, 1);
+ ConstantInt *TripCount = ConstantInt::get(ILengthType, LoopEndCount);
+
+ IRBuilder<> PLBuilder(InsertBefore);
+
+ auto [CmpSrcAddr, CmpDstAddr] =
+ tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
+ Value *PtrCompare =
+ PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
+ Instruction *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
+ &ThenTerm, &ElseTerm);
+
+ BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+ BasicBlock *CopyForwardBB = ElseTerm->getParent();
+ BasicBlock *ExitBB = InsertBefore->getParent();
+ ExitBB->setName("memmove_done");
+
+ Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+ // Helper function to generate a load/store pair of a given type in the
+ // residual. Used in the forward and backward branches.
+ auto GenerateResidualLdStPair = [&](Type *OpTy, IRBuilderBase &Builder,
+ uint64_t &BytesCopied) {
+ Align ResSrcAlign(commonAlignment(SrcAlign, BytesCopied));
+ Align ResDstAlign(commonAlignment(DstAlign, BytesCopied));
+
+ // Calculate the new index
+ unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+
+ uint64_t GepIndex = BytesCopied / OperandSize;
+ assert(GepIndex * OperandSize == BytesCopied &&
+ "Division should have no remainder!");
+
+ Value *SrcGEP = Builder.CreateInBoundsGEP(
+ OpTy, SrcAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(OpTy, SrcGEP, ResSrcAlign, SrcIsVolatile);
+ Value *DstGEP = Builder.CreateInBoundsGEP(
+ OpTy, DstAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ Builder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
+ BytesCopied += OperandSize;
+ };
+
+ // Copying backwards.
+ if (RemainingBytes != 0) {
+ CopyBackwardsBB->setName("memmove_bwd_residual");
+ uint64_t BytesCopied = BytesCopiedInLoop;
+
+ // Residual code is required to move the remaining bytes. We need the same
+ // instructions as in the forward case, only in reverse. So we generate code
+ // the same way, except that we change the IRBuilder insert point for each
+ // load/store pair so that each one is inserted before the previous one
+ // instead of after it.
+ IRBuilder<> BwdResBuilder(CopyBackwardsBB->getFirstNonPHI());
+ SmallVector<Type *, 5> RemainingOps;
+ TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+ SrcAS, DstAS, PartSrcAlign.value(),
+ PartDstAlign.value());
+ for (auto *OpTy : RemainingOps) {
+ // Reverse the order of the emitted operations.
+ BwdResBuilder.SetInsertPoint(CopyBackwardsBB->getFirstNonPHI());
+ GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
+ }
+ }
+ if (LoopEndCount != 0) {
+ BasicBlock *LoopBB = CopyBackwardsBB;
+ BasicBlock *PredBB = OrigBB;
+ if (RemainingBytes != 0) {
+ // If we introduce residual code, it needs its own BB.
+ LoopBB = CopyBackwardsBB->splitBasicBlock(
+ CopyBackwardsBB->getTerminator(), "memmove_bwd_loop");
+ PredBB = CopyBackwardsBB;
+ } else {
+ CopyBackwardsBB->setName("memmove_bwd_loop");
+ }
+ IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+ PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
+ Value *Index = LoopBuilder.CreateSub(LoopPhi, One, "bwd_index");
+ Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, Index);
+ Value *Element = LoopBuilder.CreateAlignedLoad(
+ LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+ Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, Index);
+ LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+ DstIsVolatile);
+
+ // Replace the unconditional branch introduced by
+ // SplitBlockAndInsertIfThenElse to turn LoopBB into a loop.
+ Instruction *UncondTerm = LoopBB->getTerminator();
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, Zero), ExitBB,
+ LoopBB);
+ UncondTerm->eraseFromParent();
+
+ LoopPhi->addIncoming(Index, LoopBB);
+ LoopPhi->addIncoming(TripCount, PredBB);
+ }
+
+ // Copying forward.
+ BasicBlock *FwdResidualBB = CopyForwardBB;
+ if (LoopEndCount != 0) {
+ CopyForwardBB->setName("memmove_fwd_loop");
+ BasicBlock *LoopBB = CopyForwardBB;
+ BasicBlock *SuccBB = ExitBB;
+ if (RemainingBytes != 0) {
+ // If we introduce residual code, it needs its own BB.
+ SuccBB = CopyForwardBB->splitBasicBlock(CopyForwardBB->getTerminator(),
+ "memmove_fwd_residual");
+ FwdResidualBB = SuccBB;
+ }
+ IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+ PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
+ Value *LoadGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopPhi);
+ Value *Element = LoopBuilder.CreateAlignedLoad(
+ LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+ Value *StoreGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopPhi);
+ LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+ DstIsVolatile);
+ Value *Index = LoopBuilder.CreateAdd(LoopPhi, One);
+ LoopPhi->addIncoming(Index, LoopBB);
+ LoopPhi->addIncoming(Zero, OrigBB);
+
+ // Replace the unconditional branch to turn LoopBB into a loop.
+ Instruction *UncondTerm = LoopBB->getTerminator();
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, TripCount), SuccBB,
+ LoopBB);
+ UncondTerm->eraseFromParent();
+ }
+
+ if (RemainingBytes != 0) {
+ uint64_t BytesCopied = BytesCopiedInLoop;
+
+ // Residual code is required to move the remaining bytes. In the forward
+ // case, we emit it in the normal order.
+ IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
+ SmallVector<Type *, 5> RemainingOps;
+ TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+ SrcAS, DstAS, PartSrcAlign.value(),
+ PartDstAlign.value());
+ for (auto *OpTy : RemainingOps)
+ GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
+ }
}
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
@@ -572,11 +938,8 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
return true;
}
- if (TTI.isValidAddrSpaceCast(DstAS, SrcAS))
- DstAddr = CastBuilder.CreateAddrSpaceCast(DstAddr, SrcAddr->getType());
- else if (TTI.isValidAddrSpaceCast(SrcAS, DstAS))
- SrcAddr = CastBuilder.CreateAddrSpaceCast(SrcAddr, DstAddr->getType());
- else {
+ if (!(TTI.isValidAddrSpaceCast(DstAS, SrcAS) ||
+ TTI.isValidAddrSpaceCast(SrcAS, DstAS))) {
// We don't know generically if it's legal to introduce an
// addrspacecast. We need to know either if it's legal to insert an
// addrspacecast, or if the address spaces cannot alias.
@@ -587,9 +950,15 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
}
}
- createMemMoveLoop(
- /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
- SrcIsVolatile, DstIsVolatile, TTI);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
+ createMemMoveLoopKnownSize(
+ /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
+ SrcIsVolatile, DstIsVolatile, TTI);
+ } else {
+ createMemMoveLoopUnknownSize(
+ /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
+ SrcIsVolatile, DstIsVolatile, TTI);
+ }
return true;
}
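
Since the lowering above is long, here is a minimal self-contained sketch, in plain C++, of the control flow createMemMoveLoopKnownSize emits for a compile-time length. Everything in it (the name memmoveKnownSize, the fixed 4-byte loop operand, the memcpy-based chunk helper) is an illustrative assumption standing in for the emitted IR, not code from the patch:

  #include <cstdint>
  #include <cstring>

  // Hand-written model of the lowered memmove: the pointer compare picks the
  // copy direction, the main loop moves LoopOpSize-wide chunks, and leftover
  // bytes are handled by straight-line residual code rather than a loop.
  void memmoveKnownSize(uint8_t *Dst, const uint8_t *Src, uint64_t CopyLen) {
    constexpr uint64_t LoopOpSize = 4; // Stands in for the TTI-chosen type.
    const uint64_t TripCount = CopyLen / LoopOpSize;
    const uint64_t BytesCopiedInLoop = TripCount * LoopOpSize;
    auto CopyChunk = [&](uint64_t I) { // One wide load feeding one wide store.
      uint8_t Tmp[LoopOpSize];
      std::memcpy(Tmp, Src + I * LoopOpSize, LoopOpSize);
      std::memcpy(Dst + I * LoopOpSize, Tmp, LoopOpSize);
    };
    if (Src < Dst) {
      // Backward copy: residual first (highest byte first), then the main
      // loop counting down -- memmove_bwd_residual / memmove_bwd_loop.
      for (uint64_t B = CopyLen; B-- > BytesCopiedInLoop;)
        Dst[B] = Src[B];
      for (uint64_t I = TripCount; I-- > 0;)
        CopyChunk(I);
    } else {
      // Forward copy: main loop first, then the residual in normal order --
      // memmove_fwd_loop / memmove_fwd_residual.
      for (uint64_t I = 0; I < TripCount; ++I)
        CopyChunk(I);
      for (uint64_t B = BytesCopiedInLoop; B < CopyLen; ++B)
        Dst[B] = Src[B];
    }
  }
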
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f23e288..1a17524 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3246,7 +3246,12 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI,
}
/// Return true if we can thread a branch across this block.
-static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB,
+ const TargetTransformInfo &TTI) {
+ // Skip threading if the branch may be divergent.
+ if (TTI.hasBranchDivergence(BB->getParent()))
+ return false;
+
int Size = 0;
EphemeralValueTracker EphTracker;
@@ -3301,10 +3306,9 @@ static ConstantInt *getKnownValueOnEdge(Value *V, BasicBlock *From,
/// If we have a conditional branch on something for which we know the constant
/// value in predecessors (e.g. a phi node in the current block), thread edges
/// from the predecessor to their ultimate destination.
-static std::optional<bool>
-FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
- const DataLayout &DL,
- AssumptionCache *AC) {
+static std::optional<bool> FoldCondBranchOnValueKnownInPredecessorImpl(
+ BranchInst *BI, DomTreeUpdater *DTU, const DataLayout &DL,
+ const TargetTransformInfo &TTI, AssumptionCache *AC) {
SmallMapVector<ConstantInt *, SmallSetVector<BasicBlock *, 2>, 2> KnownValues;
BasicBlock *BB = BI->getParent();
Value *Cond = BI->getCondition();
@@ -3332,7 +3336,7 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
// Now we know that this block has multiple preds and two succs.
// Check that the block is small enough and values defined in the block are
// not used outside of it.
- if (!BlockIsSimpleEnoughToThreadThrough(BB))
+ if (!BlockIsSimpleEnoughToThreadThrough(BB, TTI))
return false;
for (const auto &Pair : KnownValues) {
@@ -3459,15 +3463,14 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
return false;
}
-static bool FoldCondBranchOnValueKnownInPredecessor(BranchInst *BI,
- DomTreeUpdater *DTU,
- const DataLayout &DL,
- AssumptionCache *AC) {
+static bool FoldCondBranchOnValueKnownInPredecessor(
+ BranchInst *BI, DomTreeUpdater *DTU, const DataLayout &DL,
+ const TargetTransformInfo &TTI, AssumptionCache *AC) {
std::optional<bool> Result;
bool EverChanged = false;
do {
// Note that std::nullopt means "we changed things, but recurse further."
- Result = FoldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, AC);
+ Result = FoldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, TTI, AC);
EverChanged |= Result == std::nullopt || *Result;
} while (Result == std::nullopt);
return EverChanged;
@@ -7543,7 +7546,7 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// If this is a branch on something for which we know the constant value in
// predecessors (e.g. a phi node in the current block), thread control
// through this block.
- if (FoldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, Options.AC))
+ if (FoldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, TTI, Options.AC))
return requestResimplify();
// Scan predecessor blocks for conditional branches.
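
The new guard in isolation, as a condensed sketch of the hunk above (body elided): threading duplicates the conditional branch into each predecessor, which presumably does not pay off when the branch may be divergent, e.g. on GPU targets, so such functions now bail out before any size accounting.

  static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB,
                                                 const TargetTransformInfo &TTI) {
    // Divergent-branch targets: never thread across this block.
    if (TTI.hasBranchDivergence(BB->getParent()))
      return false;
    // ... size and use-outside-block checks unchanged ...
  }
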
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index c63cf0c..590f315 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -365,8 +365,8 @@ public:
/// Return the best VPlan for \p VF.
VPlan &getBestPlanFor(ElementCount VF) const;
- /// Return the most profitable plan and fix its VF to the most profitable one.
- VPlan &getBestPlan() const;
+ /// Return the most profitable vectorization factor.
+ ElementCount getBestVF() const;
/// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
/// according to the best selected \p VF and \p UF.
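
For orientation, the reworked planner API in one hedged sketch (condensed from the LoopVectorize.cpp hunks below; LVP, IC, LB and DT are the surrounding locals there): the most profitable VF is computed first, the matching plan is then looked up, and no plan needs its VF fixed up via setVF() anymore.

  ElementCount BestVF = LVP.getBestVF();        // Cost model picks the width.
  VPlan &BestPlan = LVP.getBestPlanFor(BestVF); // Look up the plan for that VF.
  LVP.executePlan(BestVF, IC, BestPlan, LB, DT, false);
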
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859..9733ac0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7162,13 +7162,12 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
return Cost;
}
-VPlan &LoopVectorizationPlanner::getBestPlan() const {
+ElementCount LoopVectorizationPlanner::getBestVF() const {
// If there is a single VPlan with a single VF, return it directly.
VPlan &FirstPlan = *VPlans[0];
if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
- return FirstPlan;
+ return *FirstPlan.vectorFactors().begin();
- VPlan *BestPlan = &FirstPlan;
ElementCount ScalarVF = ElementCount::getFixed(1);
assert(hasPlanWithVF(ScalarVF) &&
"More than a single plan/VF w/o any plan having scalar VF");
@@ -7199,14 +7198,11 @@ VPlan &LoopVectorizationPlanner::getBestPlan() const {
InstructionCost Cost = cost(*P, VF);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
- if (isMoreProfitable(CurrentFactor, BestFactor)) {
+ if (isMoreProfitable(CurrentFactor, BestFactor))
BestFactor = CurrentFactor;
- BestPlan = &*P;
- }
}
}
- BestPlan->setVF(BestFactor.Width);
- return *BestPlan;
+ return BestFactor.Width;
}
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -10001,10 +9997,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM, BFI, PSI, Checks);
- VPlan &BestPlan = LVP.getBestPlan();
- assert(BestPlan.hasScalarVFOnly() &&
+ ElementCount BestVF = LVP.getBestVF();
+ assert(BestVF.isScalar() &&
"VPlan cost model and legacy cost model disagreed");
- LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
+ VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
+ LVP.executePlan(BestVF, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
@@ -10015,20 +10012,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
} else {
// If we decided that it is *legal* to vectorize the loop, then do it.
+ ElementCount BestVF = LVP.getBestVF();
+ LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << BestVF << "\n");
+ assert(VF.Width == BestVF &&
+ "VPlan cost model and legacy cost model disagreed");
+ VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
// Consider vectorizing the epilogue too if it's profitable.
VectorizationFactor EpilogueVF =
- LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
+ LVP.selectEpilogueVectorizationFactor(BestVF, IC);
if (EpilogueVF.Width.isVector()) {
// The first pass vectorizes the main loop and creates a scalar epilogue
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
- EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
+ EpilogueLoopVectorizationInfo EPI(BestVF, IC, EpilogueVF.Width, 1);
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks);
- std::unique_ptr<VPlan> BestMainPlan(
- LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
+ assert(EPI.MainLoopVF == VF.Width && "VFs must match");
+ std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
@@ -10119,18 +10121,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (!MainILV.areSafetyChecksAdded())
DisableRuntimeUnroll = true;
} else {
- VPlan &BestPlan = LVP.getBestPlan();
- assert(size(BestPlan.vectorFactors()) == 1 &&
- "Plan should have a single VF");
- ElementCount Width = *BestPlan.vectorFactors().begin();
- LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
- << "\n");
- assert(VF.Width == Width &&
- "VPlan cost model and legacy cost model disagreed");
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, BestVF,
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
PSI, Checks);
- LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
+ LVP.executePlan(BestVF, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there
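
The epilogue path, condensed the same way (simplified from the hunk above; the epilogue re-run and resume-value plumbing are elided): the check against the legacy cost model happens once up front, and the main plan is still duplicated before execution, so the planner's stored plan stays intact.

  ElementCount BestVF = LVP.getBestVF();
  assert(VF.Width == BestVF &&
         "VPlan cost model and legacy cost model disagreed");
  VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
  VectorizationFactor EpilogueVF =
      LVP.selectEpilogueVectorizationFactor(BestVF, IC);
  if (EpilogueVF.Width.isVector()) {
    EpilogueLoopVectorizationInfo EPI(BestVF, IC, EpilogueVF.Width, 1);
    // Execute a copy of the main plan; the epilogue pass follows shortly.
    std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
    LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT,
                    true);
  }
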
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b74417f..3bdd8fd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8363,6 +8363,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
: TTI.getStridedMemoryOpCost(
Instruction::Load, LoadTy, LI->getPointerOperand(),
/*VariableMask=*/false, Alignment, CostKind, LI);
+ // Add the cost of external uses.
+ for (auto [Idx, V] : enumerate(VL.slice(
+ P.first, std::min<unsigned>(VL.size() - P.first, VF))))
+ if (!R.areAllUsersVectorized(cast<Instruction>(V)))
+ GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement,
+ LoadTy, CostKind, Idx);
// Estimate GEP cost.
SmallVector<Value *> PointerOps(VF);
for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
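
The intent of the added accounting, as a hedged sketch (Slice stands for the VF-wide run of loads at P.first; the other names are the surrounding locals): if some users of a load in the slice stay scalar, those users must get the value back out of the vector, so the extract is now charged on top of the memory-op cost instead of being ignored.

  for (auto [Idx, V] : enumerate(Slice))                // One lane per load.
    if (!R.areAllUsersVectorized(cast<Instruction>(V))) // Scalar users remain,
      GatherCost += TTI.getVectorInstrCost(             // so pay for the extract.
          Instruction::ExtractElement, LoadTy, CostKind, Idx);
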
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e1d18d5..c9da5e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2246,12 +2246,12 @@ public:
/// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
class VPReductionEVLRecipe : public VPReductionRecipe {
public:
- VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL, VPValue *CondOp)
+ VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp)
: VPReductionRecipe(
- VPDef::VPReductionEVLSC, R->getRecurrenceDescriptor(),
- cast_or_null<Instruction>(R->getUnderlyingValue()),
- ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}), CondOp,
- R->isOrdered()) {}
+ VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(),
+ cast_or_null<Instruction>(R.getUnderlyingValue()),
+ ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
+ R.isOrdered()) {}
~VPReductionEVLRecipe() override = default;
@@ -2558,10 +2558,10 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
/// using the address to load from, the explicit vector length and an optional
/// mask.
struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
- VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask)
- : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(),
- {L->getAddr(), EVL}, L->isConsecutive(),
- L->isReverse(), L->getDebugLoc()),
+ VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
+ : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
+ {L.getAddr(), &EVL}, L.isConsecutive(),
+ L.isReverse(), L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -2634,11 +2634,10 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
/// using the value to store, the address to store to, the explicit vector
/// length and an optional mask.
struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
- VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
- : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
- {S->getAddr(), S->getStoredValue(), EVL},
- S->isConsecutive(), S->isReverse(),
- S->getDebugLoc()) {
+ VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
+ : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
+ {S.getAddr(), S.getStoredValue(), &EVL},
+ S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
setMask(Mask);
}
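
At the call sites (see the VPlanTransforms.cpp hunk below), the switch from pointers to references reads as in this condensed sketch. The design point: the recipe being replaced and the EVL are always required, so references encode the non-null contract in the type, while the mask, which may legitimately be absent, stays a VPValue pointer.

  if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR))
    NewRecipe = new VPWidenLoadEVLRecipe(*L, *VPEVL, NewMask); // Mask may be null.
  else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR))
    NewRecipe = new VPWidenStoreEVLRecipe(*S, *VPEVL, NewMask);
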
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f..045f6c3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1488,13 +1488,13 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(CurRecipe)) {
VPValue *NewMask = GetNewMask(MemR->getMask());
if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR))
- NewRecipe = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
+ NewRecipe = new VPWidenLoadEVLRecipe(*L, *VPEVL, NewMask);
else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR))
- NewRecipe = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
+ NewRecipe = new VPWidenStoreEVLRecipe(*S, *VPEVL, NewMask);
else
llvm_unreachable("unsupported recipe");
} else if (auto *RedR = dyn_cast<VPReductionRecipe>(CurRecipe)) {
- NewRecipe = new VPReductionEVLRecipe(RedR, VPEVL,
+ NewRecipe = new VPReductionEVLRecipe(*RedR, *VPEVL,
GetNewMask(RedR->getCondOp()));
}
diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
index c5da46a..963bb8a 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -990,192 +990,192 @@ declare {<64 x i8>, <64 x i1>} @llvm.smul.with.overflow.v64i8(<64 x i8>, <64 x
define i32 @smul(i32 %arg) {
; SSSE3-LABEL: 'smul'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'smul'
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'smul'
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'smul'
; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'smul'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'smul'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'smul'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'smul'
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; GLM-LABEL: 'smul'
; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; BTVER2-LABEL: 'smul'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I64 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
@@ -1228,192 +1228,192 @@ declare {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x
define i32 @umul(i32 %arg) {
; SSSE3-LABEL: 'umul'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'umul'
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'umul'
; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'umul'
; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'umul'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'umul'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'umul'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'umul'
; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; GLM-LABEL: 'umul'
; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; BTVER2-LABEL: 'umul'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I64 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index bbe3396..88a22f9 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -51,22 +51,22 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
; THRU-LABEL: 'umul'
; THRU-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; THRU-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; THRU-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; LATE-LABEL: 'umul'
; LATE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; LATE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; LATE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE-LABEL: 'umul'
; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE_LATE-LABEL: 'umul'
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%s = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll b/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll
index 2e61a28..6d8e296 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/load-store-index-loaded-in-loop.ll
@@ -9,21 +9,19 @@
define void @B_indices_loaded_in_loop_A_stored(ptr %A, ptr noalias %B, i64 %N, i64 %off) {
; CHECK-LABEL: 'B_indices_loaded_in_loop_A_stored'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
+; CHECK-NEXT: IndirectUnsafe:
+; CHECK-NEXT: %l = load i32, ptr %gep.B, align 4 ->
+; CHECK-NEXT: store i32 %inc, ptr %gep.B, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Unknown:
+; CHECK-NEXT: %indices = load i8, ptr %gep.A, align 1 ->
+; CHECK-NEXT: store i32 %l, ptr %gep.C, align 4
+; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
-; CHECK-NEXT: Check 0:
-; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]):
-; CHECK-NEXT: %gep.C = getelementptr inbounds i32, ptr %A, i64 %iv
-; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]):
-; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv.off
; CHECK-NEXT: Grouped accesses:
-; CHECK-NEXT: Group [[GRP1]]:
-; CHECK-NEXT: (Low: %A High: ((4 * %N) + %A))
-; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop>
-; CHECK-NEXT: Group [[GRP2]]:
-; CHECK-NEXT: (Low: (%off + %A) High: (%N + %off + %A))
-; CHECK-NEXT: Member: {(%off + %A),+,1}<nw><%loop>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
@@ -59,9 +57,9 @@ define void @B_indices_loaded_in_loop_A_not_stored(ptr %A, ptr noalias %B, i64 %
; CHECK-LABEL: 'B_indices_loaded_in_loop_A_not_stored'
; CHECK-NEXT: loop:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Unknown:
+; CHECK-NEXT: IndirectUnsafe:
; CHECK-NEXT: %l = load i32, ptr %gep.B, align 4 ->
; CHECK-NEXT: store i32 %inc, ptr %gep.B, align 4
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll b/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll
index 546a75c..28ee6c6 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll
@@ -13,9 +13,9 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; CHECK-NEXT: for.body:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop
; CHECK-NOT: Report: cannot identify array bounds
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Unknown:
+; CHECK-NEXT: IndirectUnsafe:
; CHECK-NEXT: %loadA = load i16, ptr %arrayidxA, align 2 ->
; CHECK-NEXT: store i16 %mul, ptr %arrayidxA, align 2
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll b/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll
index 18e45f4..8ca3038 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/print-order.ll
@@ -9,8 +9,9 @@
; CHECK-LABEL: 'negative_step'
; CHECK: LAA: Found an analyzable loop: loop
; CHECK: LAA: Checking memory dependencies
-; CHECK-NEXT: LAA: Src Scev: {(4092 + %A),+,-4}<nw><%loop>Sink Scev: {(4088 + %A)<nuw>,+,-4}<nw><%loop>(Induction step: -1)
+; CHECK-NEXT: LAA: Src Scev: {(4092 + %A),+,-4}<nw><%loop>Sink Scev: {(4088 + %A)<nuw>,+,-4}<nw><%loop>
; CHECK-NEXT: LAA: Distance for store i32 %add, ptr %gep.A.plus.1, align 4 to %l = load i32, ptr %gep.A, align 4: -4
+; CHECK-NEXT: LAA: Src induction step: -1 Sink induction step: -1
; CHECK-NEXT: LAA: Dependence is negative
define void @negative_step(ptr nocapture %A) {
@@ -41,8 +42,9 @@ exit:
; CHECK-LABEL: 'positive_step'
; CHECK: LAA: Found an analyzable loop: loop
; CHECK: LAA: Checking memory dependencies
-; CHECK-NEXT: LAA: Src Scev: {(4 + %A)<nuw>,+,4}<nuw><%loop>Sink Scev: {%A,+,4}<nw><%loop>(Induction step: 1)
+; CHECK-NEXT: LAA: Src Scev: {(4 + %A)<nuw>,+,4}<nuw><%loop>Sink Scev: {%A,+,4}<nw><%loop>
; CHECK-NEXT: LAA: Distance for %l = load i32, ptr %gep.A, align 4 to store i32 %add, ptr %gep.A.minus.1, align 4: -4
+; CHECK-NEXT: LAA: Src induction step: 1 Sink induction step: 1
; CHECK-NEXT: LAA: Dependence is negative
define void @positive_step(ptr nocapture %A) {
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
index 60fe8b4..8bef7583 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
@@ -5,9 +5,9 @@ define void @test(ptr noalias %x, ptr noalias %y, ptr noalias %z) {
; CHECK-LABEL: 'test'
; CHECK-NEXT: loop:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Unknown:
+; CHECK-NEXT: IndirectUnsafe:
; CHECK-NEXT: %load = load double, ptr %gep.sel, align 8 ->
; CHECK-NEXT: store double %load, ptr %gep.sel2, align 8
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
index 7c1b11e..f0aed24 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
@@ -276,9 +276,9 @@ define void @single_stride_used_for_trip_count(ptr noalias %A, ptr noalias %B, i
; CHECK-LABEL: 'single_stride_used_for_trip_count'
; CHECK-NEXT: loop:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Unsafe indirect dependence.
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Unknown:
+; CHECK-NEXT: IndirectUnsafe:
; CHECK-NEXT: %load = load i32, ptr %gep.A, align 4 ->
; CHECK-NEXT: store i32 %add, ptr %gep.A.next, align 4
; CHECK-EMPTY:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index 4d4da86..de9af52 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -9,56 +9,71 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
; LOOP: ; %bb.0:
; LOOP-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
; LOOP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; LOOP-NEXT: s_xor_b64 s[4:5], exec, s[0:1]
-; LOOP-NEXT: s_cbranch_execz .LBB0_3
-; LOOP-NEXT: ; %bb.1: ; %copy_forward
-; LOOP-NEXT: s_mov_b64 s[6:7], 0
+; LOOP-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; LOOP-NEXT: s_cbranch_execnz .LBB0_3
+; LOOP-NEXT: ; %bb.1: ; %Flow
+; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; LOOP-NEXT: s_cbranch_execnz .LBB0_4
+; LOOP-NEXT: .LBB0_2: ; %memmove_done
+; LOOP-NEXT: s_endpgm
+; LOOP-NEXT: .LBB0_3:
+; LOOP-NEXT: s_mov_b32 s6, 0
+; LOOP-NEXT: s_mov_b32 s7, 0xf000
+; LOOP-NEXT: s_mov_b64 s[4:5], 0
+; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[4:7], 0 addr64 offset:2
+; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[4:7], 0 addr64
+; LOOP-NEXT: s_waitcnt vmcnt(3)
+; LOOP-NEXT: v_lshlrev_b32_e32 v3, 8, v4
+; LOOP-NEXT: s_waitcnt vmcnt(2)
+; LOOP-NEXT: v_lshlrev_b32_e32 v4, 24, v5
+; LOOP-NEXT: s_waitcnt vmcnt(1)
+; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; LOOP-NEXT: s_waitcnt vmcnt(0)
+; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT: v_or_b32_e32 v3, v4, v5
+; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64
+; LOOP-NEXT: s_waitcnt expcnt(0)
+; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[4:7], 0 addr64 offset:2
+; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT: ; implicit-def: $vgpr2_vgpr3
+; LOOP-NEXT: ; implicit-def: $vgpr0_vgpr1
+; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; LOOP-NEXT: s_cbranch_execz .LBB0_2
+; LOOP-NEXT: .LBB0_4: ; %memmove_bwd_residual
; LOOP-NEXT: s_mov_b32 s2, 0
; LOOP-NEXT: s_mov_b32 s3, 0xf000
; LOOP-NEXT: s_mov_b64 s[0:1], 0
-; LOOP-NEXT: v_mov_b32_e32 v4, s6
-; LOOP-NEXT: v_mov_b32_e32 v5, s7
-; LOOP-NEXT: .LBB0_2: ; %copy_forward_loop
-; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4
-; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc
+; LOOP-NEXT: s_waitcnt expcnt(2)
+; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:2
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT: v_add_i32_e32 v6, vcc, v0, v4
-; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v1, v5, vcc
-; LOOP-NEXT: v_add_i32_e32 v4, vcc, 1, v4
-; LOOP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; LOOP-NEXT: v_cmp_ne_u32_e32 vcc, 4, v4
+; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64
+; LOOP-NEXT: s_waitcnt vmcnt(3)
+; LOOP-NEXT: v_lshlrev_b32_e32 v3, 8, v4
+; LOOP-NEXT: s_waitcnt vmcnt(2)
+; LOOP-NEXT: v_lshlrev_b32_e32 v4, 24, v5
+; LOOP-NEXT: s_waitcnt vmcnt(1)
+; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; LOOP-NEXT: s_waitcnt vmcnt(0)
-; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT: s_cbranch_vccnz .LBB0_2
-; LOOP-NEXT: .LBB0_3: ; %Flow17
-; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[4:5]
-; LOOP-NEXT: s_cbranch_execz .LBB0_6
-; LOOP-NEXT: ; %bb.4: ; %copy_backwards
-; LOOP-NEXT: v_add_i32_e32 v0, vcc, 3, v0
-; LOOP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; LOOP-NEXT: v_add_i32_e32 v2, vcc, 3, v2
-; LOOP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; LOOP-NEXT: s_mov_b32 s0, -4
-; LOOP-NEXT: s_mov_b32 s6, 0
-; LOOP-NEXT: s_mov_b32 s7, 0xf000
-; LOOP-NEXT: s_mov_b64 s[4:5], 0
-; LOOP-NEXT: v_mov_b32_e32 v4, s0
-; LOOP-NEXT: .LBB0_5: ; %copy_backwards_loop
-; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT: v_or_b32_e32 v3, v4, v5
+; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[4:7], 0 addr64
-; LOOP-NEXT: v_add_i32_e32 v4, vcc, 1, v4
-; LOOP-NEXT: s_and_b64 vcc, vcc, exec
-; LOOP-NEXT: s_waitcnt vmcnt(0)
-; LOOP-NEXT: buffer_store_byte v5, v[0:1], s[4:7], 0 addr64
-; LOOP-NEXT: v_add_i32_e64 v0, s[0:1], -1, v0
-; LOOP-NEXT: v_addc_u32_e64 v1, s[0:1], -1, v1, s[0:1]
-; LOOP-NEXT: v_add_i32_e64 v2, s[0:1], -1, v2
-; LOOP-NEXT: v_addc_u32_e64 v3, s[0:1], -1, v3, s[0:1]
-; LOOP-NEXT: s_cbranch_vccz .LBB0_5
-; LOOP-NEXT: .LBB0_6: ; %memmove_done
+; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
; LOOP-NEXT: s_endpgm
;
; UNROLL-LABEL: memmove_p1i8:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
index 91abbff..74a7ca2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -1,9 +1,9 @@
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri -filetype=obj | llvm-readobj -S --sd --syms - | FileCheck --check-prefix=ELF %s
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
; ELF: Section {
; ELF: Name: .text
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 6ec6f6a..17fe3ad 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -2,12 +2,12 @@
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index da88b67..16f3ff4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2,21 +2,21 @@
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index eb05613..c729618 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2,17 +2,17 @@
; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 2970495..f67fcd6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -2,10 +2,10 @@
; RUN: llc -mtriple=amdgcn-- - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
declare i1 @llvm.amdgcn.wqm.vote(i1)
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 7ae4766..3a2efad 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -2,12 +2,12 @@
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 62e50b2..d0c0b62 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -2,12 +2,12 @@
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
index 2a7206c..bc20665 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -1,7 +1,7 @@
; RUN: llc -O0 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
; RUN: llc -O0 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir
index 99ea18c..77bc972 100644
--- a/llvm/test/CodeGen/AMDGPU/early-term.mir
+++ b/llvm/test/CodeGen/AMDGPU/early-term.mir
@@ -1,6 +1,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s
--- |
define amdgpu_ps void @early_term_scc0_end_block() {
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index c05f9c6..600c35b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 46f0bb0..954d810 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index bd5e589..4ab8683 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 5ffa71d3..cbe243a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
index c17feae..e2635fc 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
@@ -1,6 +1,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
# GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr_hidden_bundle
# GCN: bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
index 8d02f06..401f6e30 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
@@ -1,6 +1,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s
# GCN-LABEL: name: break_smem_clause_max_look_ahead_in_bundle
# GCN: S_LOAD_DWORDX2_IMM
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll b/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
index 1cb27dc..1801082 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
@@ -6,13 +6,13 @@
; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic < %s | FileCheck -check-prefix=CU %s
; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic < %s | FileCheck -check-prefix=CU %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
; Checks 10.1, 10.3 and 11 generic targets allow cumode/wave64.
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
index 3dc9f20..431ba38f 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
; GCN: amdhsa.kernels:
; GCN: .name: wavefrontsize
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 487e62b..3747620 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -5,10 +5,10 @@
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -S --sd --syms - | FileCheck --check-prefix=ELF %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri --amdhsa-code-object-version=4 | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
; The SHT_NOTE section contains the output from the .hsa_code_object_*
; directives.
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index b9269e2..2cb440b 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -1,9 +1,9 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
index 18bc442..266da50 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s
---
name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
index af2e600..af7ba4e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-after=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-after=amdgpu-insert-delay-alu %s -o - | FileCheck %s
---
name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
index 0bdd249..5c9c0d1 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
---
name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index 1fd0c67..05fd883 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
index fa447f9..28492f1 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
index ab160ff..acd48a6 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
; Test formal argument lowering as well as calls to amdgpu_gfx functions.
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index ba946fe..c62b4e5 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
; We only care about which physical registers the parameters are copied from;
; the function bodies are just some arbitrary uses.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 3ff34c0..047b35b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck %s
declare i32 @llvm.amdgcn.ballot.i32(i1)
declare i32 @llvm.ctpop.i32(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index 0e659b7..88e3929 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index d941830..a5841e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1))
declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
index 3a540bd..b61ca56 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -1,9 +1,9 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}full_mask:
; GCN: s_mov_b64 exec, -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
index d3961c0..d4ae040 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
@@ -1,11 +1,11 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
; GCN-LABEL: {{^}}test_init_exec:
; GFX1032: s_mov_b32 exec_lo, 0x12345
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
index 63d27260..e0dacd4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; GCN-LABEL: {{^}}gs_const:
; GCN-NOT: v_cmpx
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
index a538364..3d5e6dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
index 368d637..d11aaf0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index d37b4b4..684ca3a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
; SI-LABEL: static_exact:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 5cb57ee..9e2e37a 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -17,6 +17,9 @@ declare void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) nocapture, ptr addrspace(3)
declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
declare void @llvm.memmove.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
+declare void @llvm.memmove.p0.p3.i32(ptr nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
+declare void @llvm.memmove.p3.p0.i32(ptr addrspace(3) nocapture writeonly, ptr nocapture readonly, i32, i1 immarg) #1
+declare void @llvm.memmove.p3.p3.i32(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p3.p5.i32(ptr addrspace(3) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p5.p3.i32(ptr addrspace(5) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
@@ -87,30 +90,25 @@ define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1
;
; ALL-LABEL: @max_size_small_static_memmove_caller0(
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1024, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1024, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP2]], align 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr addrspace(1) [[TMP5]], align 1
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1024
-; ALL-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 64
+; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
@@ -121,30 +119,37 @@ define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1
define amdgpu_kernel void @min_size_large_static_memmove_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @min_size_large_static_memmove_caller0(
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; OPT-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1025, 0
-; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; OPT: copy_backwards:
-; OPT-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; OPT: copy_backwards_loop:
-; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1025, [[COPY_BACKWARDS]] ]
-; OPT-NEXT: [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP2]], align 1
-; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; OPT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; OPT-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; OPT: copy_forward:
-; OPT-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; OPT: copy_forward_loop:
-; OPT-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load i8, ptr addrspace(1) [[TMP5]], align 1
-; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; OPT-NEXT: store i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; OPT-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; OPT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1025
-; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT: memmove_bwd_residual:
+; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
+; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT: br label [[MEMMOVE_BWD_LOOP:%.*]]
+; OPT: memmove_bwd_loop:
+; OPT-NEXT: [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[MEMMOVE_BWD_RESIDUAL]] ]
+; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP4]], 1
+; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT: memmove_fwd_loop:
+; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP10:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
+; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP8]], align 1
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1
+; OPT-NEXT: [[TMP10]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 64
+; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
+; OPT: memmove_fwd_residual:
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP13:%.*]] = load i8, ptr addrspace(1) [[TMP12]], align 1
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[TMP14]], align 1
+; OPT-NEXT: br label [[MEMMOVE_DONE]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
@@ -1333,30 +1338,25 @@ define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrs
; ALL-LABEL: @memmove_flat_align1_global_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[SRC:%.*]] to ptr
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
@@ -1372,30 +1372,25 @@ define amdgpu_kernel void @memmove_global_align1_flat_align1(ptr addrspace(1) %d
; ALL-LABEL: @memmove_global_align1_flat_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
@@ -1411,30 +1406,25 @@ define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addr
; ALL-LABEL: @memmove_flat_align1_private_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[SRC:%.*]] to ptr
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
@@ -1450,30 +1440,25 @@ define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) %
; ALL-LABEL: @memmove_private_align1_flat_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DST:%.*]] to ptr
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
@@ -1734,6 +1719,477 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
ret void
}
+
+define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %dst, ptr addrspace(3) %src) {
+; MAX1024-LABEL: @memmove_flat_align1_local_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p0.p3.i32(ptr [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_flat_align1_local_align1(
+; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
+; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 32
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL: memmove_done:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) {
+; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size(
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP4]], [[DST:%.*]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT: memmove_copy_backwards:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_bwd_residual_loop:
+; OPT-NEXT: [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT]], ptr [[TMP7]], align 1
+; OPT-NEXT: [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT: memmove_bwd_middle:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT: memmove_bwd_main_loop:
+; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 1
+; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
+; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT: memmove_copy_forward:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT: memmove_fwd_main_loop:
+; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]]
+; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT: memmove_fwd_middle:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_fwd_residual_loop:
+; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP17]], align 1
+; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT3]], ptr [[TMP18]], align 1
+; OPT-NEXT: [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]]
+; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %dst, ptr addrspace(0) %src) {
+; MAX1024-LABEL: @memmove_local_align1_flat_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) [[DST:%.*]], ptr [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_local_align1_flat_align1(
+; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr
+; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 32
+; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL: memmove_done:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) {
+; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size(
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP4]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT: memmove_copy_backwards:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_bwd_residual_loop:
+; OPT-NEXT: [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(3) [[TMP7]], align 1
+; OPT-NEXT: [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT: memmove_bwd_middle:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT: memmove_bwd_main_loop:
+; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 1
+; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
+; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT: memmove_copy_forward:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT: memmove_fwd_main_loop:
+; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]]
+; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT: memmove_fwd_middle:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_fwd_residual_loop:
+; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr [[TMP17]], align 1
+; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP18]], align 1
+; OPT-NEXT: [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]]
+; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %dst, ptr addrspace(3) %src) {
+; MAX1024-LABEL: @memmove_local_align1_local_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_local_align1_local_align1(
+; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 1
+; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 32
+; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL: memmove_done:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) {
+; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size(
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT: memmove_copy_backwards:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_bwd_residual_loop:
+; OPT-NEXT: [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1
+; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(3) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT: memmove_bwd_middle:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT: memmove_bwd_main_loop:
+; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 1
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1
+; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT: memmove_copy_forward:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT: memmove_fwd_main_loop:
+; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1
+; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP1]]
+; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT: memmove_fwd_middle:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_fwd_residual_loop:
+; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP16]], align 1
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP17]], align 1
+; OPT-NEXT: [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]]
+; OPT-NEXT: br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_private_align1(ptr addrspace(5) %dst, ptr addrspace(5) %src) {
+; MAX1024-LABEL: @memmove_private_align1_private_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_private_align1_private_align1(
+; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 1
+; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP2]], align 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP5]], align 1
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 16
+; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL: memmove_done:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_private_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size) {
+; OPT-LABEL: @memmove_private_align1_private_align1_unknown_size(
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 15
+; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT: memmove_copy_backwards:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_bwd_residual_loop:
+; OPT-NEXT: [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1
+; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(5) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(5) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT: memmove_bwd_middle:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT: memmove_bwd_main_loop:
+; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 1
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP9]], align 1
+; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP10]], align 1
+; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT: memmove_copy_forward:
+; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT: memmove_fwd_main_loop:
+; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP12]], align 1
+; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(5) [[TMP13]], align 1
+; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP1]]
+; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT: memmove_fwd_middle:
+; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT: memmove_fwd_residual_loop:
+; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(5) [[TMP16]], align 1
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(5) [[TMP17]], align 1
+; OPT-NEXT: [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]]
+; OPT-NEXT: br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_global_align4_static_residual_empty(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; OPT-LABEL: @memmove_global_align4_static_residual_empty(
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT: memmove_bwd_loop:
+; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 65, [[TMP0:%.*]] ]
+; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT: memmove_fwd_loop:
+; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 65
+; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1040, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_global_align4_static_residual_full(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; OPT-LABEL: @memmove_global_align4_static_residual_full(
+; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT: memmove_bwd_residual:
+; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
+; OPT-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
+; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
+; OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 518
+; OPT-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[TMP4]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 518
+; OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 1
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 1
+; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(1) [[TMP10]], align 1
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT: store i64 [[TMP11]], ptr addrspace(1) [[TMP12]], align 1
+; OPT-NEXT: br label [[MEMMOVE_BWD_LOOP:%.*]]
+; OPT: memmove_bwd_loop:
+; OPT-NEXT: [[TMP13:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[MEMMOVE_BWD_RESIDUAL]] ]
+; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP13]], 1
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP14]], align 1
+; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP15]], align 1
+; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT: memmove_fwd_loop:
+; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP17]], align 1
+; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP18]], align 1
+; OPT-NEXT: [[TMP19]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 64
+; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
+; OPT: memmove_fwd_residual:
+; OPT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(1) [[TMP21]], align 1
+; OPT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT: store i64 [[TMP22]], ptr addrspace(1) [[TMP23]], align 1
+; OPT-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(1) [[TMP24]], align 1
+; OPT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT: store i32 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1
+; OPT-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 518
+; OPT-NEXT: [[TMP28:%.*]] = load i16, ptr addrspace(1) [[TMP27]], align 1
+; OPT-NEXT: [[TMP29:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 518
+; OPT-NEXT: store i16 [[TMP28]], ptr addrspace(1) [[TMP29]], align 1
+; OPT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
+; OPT-NEXT: [[TMP31:%.*]] = load i8, ptr addrspace(1) [[TMP30]], align 1
+; OPT-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
+; OPT-NEXT: store i8 [[TMP31]], ptr addrspace(1) [[TMP32]], align 1
+; OPT-NEXT: br label [[MEMMOVE_DONE]]
+; OPT: memmove_done:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1039, i1 false)
+ ret void
+}
+
define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) {
; OPT-LABEL: @test_umin(
; OPT-NEXT: entry:
@@ -1783,30 +2239,25 @@ define amdgpu_kernel void @memmove_volatile(ptr addrspace(1) %dst, ptr addrspace
;
; ALL-LABEL: @memmove_volatile(
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 64, 0
-; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL: copy_backwards:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL: copy_backwards_loop:
-; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 64, [[COPY_BACKWARDS]] ]
-; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load volatile i8, ptr addrspace(1) [[TMP2]], align 1
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT: store volatile i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL: copy_forward:
-; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL: copy_forward_loop:
-; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: [[ELEMENT2:%.*]] = load volatile i8, ptr addrspace(1) [[TMP5]], align 1
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT: store volatile i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 64
-; ALL-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL: memmove_bwd_loop:
+; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 4, [[TMP0:%.*]] ]
+; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT: store volatile <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL: memmove_fwd_loop:
+; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT: [[ELEMENT1:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT: store volatile <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 4
+; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index b5ee668..06ebd86 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 162c47f..2591ff4 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
index ece2e1b..c702de6 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -1,10 +1,10 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,ADDR64
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
# Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
#
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 92b2f51..10d0803 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -1,5 +1,5 @@
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx600 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
; Add extra verifier runs. There were some cases where invalid IR
diff --git a/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir b/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
index 9ad1660..c1121f9 100644
--- a/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
@@ -1,5 +1,5 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE32 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE64 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE64 %s
---
@@ -124,4 +124,4 @@ body: |
bb.0:
liveins: $sgpr0, $sgpr1
renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
- renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
\ No newline at end of file
+ renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
index f19b0a5..eaa3d22 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN64-MUBUF %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN32-MUBUF %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN32-MUBUF %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN64-FLATSCR %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -filetype=obj -verify-machineinstrs -start-before=prologepilog %s -o /dev/null
# Check that we do not crash when emitting ISA
diff --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
index 266e673..ff87c28 100644
--- a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
+++ b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=postra-machine-sink -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck -check-prefixes=GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=postra-machine-sink -mattr=+wavefrontsize64 -o - %s | FileCheck -check-prefixes=GFX10 %s
# Ensure that PostRA Machine Sink does not sink instructions
# past block prologues which would overwrite their uses.
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index a440b87..eebd32c 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX10-WAVE32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
; GCN-LABEL: test_kill_depth_0_imm_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
index 0f69210..2f1eeb8 100644
--- a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s
# GCN-LABEL: name: hazard_smem_war
# GCN: S_LOAD_DWORD_IMM
diff --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
index 537aca1..639bf6a 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX11 %s
--- |
define amdgpu_kernel void @check_vcc() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
index 4b9b5f9..39c7538 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-wavefrontsize32,+wavefrontsize64 --global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+wavefrontsize64 --global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK %s
define amdgpu_kernel void @icmp_test() {
; CHECK-LABEL: icmp_test:
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index e143611..17e3d93 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -1,7 +1,7 @@
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=tahiti -o - %s | FileCheck %s -check-prefixes=CHECK,SI
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9
-# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck %s
-# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck %s
+# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
+# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
---
# CHECK-LABEL: name: vccz_corrupt_workaround
# CHECK: $vcc = V_CMP_EQ_F32
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
index 4592498..47f13cb 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1010 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1030 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1010 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1030 %s
---
# After the Optimize exec masking (post-RA) pass, there's a chance of having v_cmpx instructions
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
index 645d473..e97c518 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
# GCN-LABEL: name: hazard_vcmpx_smov_exec_lo
# GCN: $sgpr0 = S_MOV_B32 $exec_lo
diff --git a/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir b/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
index 81d17a8..76713dd 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
@@ -1,9 +1,9 @@
# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX9-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX9-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
# GFX9-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
# GFX9-ERR: $vgpr0 = V_CNDMASK_B32_e64 0, $sgpr0, 0, -1, killed $sgpr0_sgpr1, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
index 6c55183..bbd0817 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
@@ -1,5 +1,5 @@
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
# GFX12-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
# GFX12-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc_lo, implicit $vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
index dc7d4af..b310f55 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
@@ -1,5 +1,5 @@
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
# GFX11-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
# GFX11-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc_lo, implicit $vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index ab1121a..645f498 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
index 829d770..ad1d66e 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
index 161d222..c52b079 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
index 511a116..ebbb3d1 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
index 5fde11c..0f8df5a 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
index 277db33..4073964 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
index 801bb06..2efbd32 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
index 12b6598..ac218a2 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
index b43969d..9ec2d8f 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 11003c4..8f052ef 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-W32 %s
; Check that WQM isn't triggered by image load/store intrinsics.
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
index afa7fde..59e755a 100644
--- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -154,7 +154,7 @@ entry:
; IR-LABEL: @memmove_caller
; IR: icmp ult ptr %src, %dst
; IR: [[PHIVAL:%[0-9a-zA-Z_]+]] = phi i64
-; IR-NEXT: %index_ptr = sub i64 [[PHIVAL]], 1
+; IR-NEXT: %bwd_main_index = sub i64 [[PHIVAL]], 1
; IR: [[FWDPHIVAL:%[0-9a-zA-Z_]+]] = phi i64
; IR: {{%[0-9a-zA-Z_]+}} = add i64 [[FWDPHIVAL]], 1
diff --git a/llvm/test/CodeGen/PowerPC/common-chain.ll b/llvm/test/CodeGen/PowerPC/common-chain.ll
index 5f8c21e..ccf0e45 100644
--- a/llvm/test/CodeGen/PowerPC/common-chain.ll
+++ b/llvm/test/CodeGen/PowerPC/common-chain.ll
@@ -642,8 +642,8 @@ define i64 @two_chain_two_bases_succ(ptr %p, i64 %offset, i64 %base1, i64 %base2
; CHECK-NEXT: cmpdi r7, 0
; CHECK-NEXT: ble cr0, .LBB6_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: add r6, r6, r4
; CHECK-NEXT: add r5, r5, r4
+; CHECK-NEXT: add r6, r6, r4
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r5, r3, r5
@@ -743,214 +743,219 @@ define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64
; CHECK-NEXT: std r9, -184(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r8, -176(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r7, -168(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r3, -160(r1) # 8-byte Folded Spill
+; CHECK-NEXT: std r4, -160(r1) # 8-byte Folded Spill
; CHECK-NEXT: ble cr0, .LBB7_7
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: sldi r6, r6, 2
-; CHECK-NEXT: li r7, 1
-; CHECK-NEXT: mr r30, r10
-; CHECK-NEXT: cmpdi r6, 1
-; CHECK-NEXT: iselgt r7, r6, r7
-; CHECK-NEXT: addi r8, r7, -1
-; CHECK-NEXT: clrldi r6, r7, 63
-; CHECK-NEXT: cmpldi r8, 3
+; CHECK-NEXT: sldi r4, r6, 2
+; CHECK-NEXT: li r6, 1
+; CHECK-NEXT: mr r0, r10
+; CHECK-NEXT: std r10, -192(r1) # 8-byte Folded Spill
+; CHECK-NEXT: cmpdi r4, 1
+; CHECK-NEXT: iselgt r4, r4, r6
+; CHECK-NEXT: addi r7, r4, -1
+; CHECK-NEXT: clrldi r6, r4, 63
+; CHECK-NEXT: cmpldi r7, 3
; CHECK-NEXT: blt cr0, .LBB7_4
; CHECK-NEXT: # %bb.2: # %for.body.preheader.new
-; CHECK-NEXT: ld r14, -168(r1) # 8-byte Folded Reload
-; CHECK-NEXT: mulli r24, r30, 24
-; CHECK-NEXT: ld r16, -184(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r15, -176(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r3, -160(r1) # 8-byte Folded Reload
-; CHECK-NEXT: rldicl r0, r7, 62, 2
-; CHECK-NEXT: sldi r11, r30, 5
-; CHECK-NEXT: sldi r19, r30, 4
-; CHECK-NEXT: sldi r7, r14, 3
-; CHECK-NEXT: add r14, r30, r14
-; CHECK-NEXT: sldi r10, r16, 3
-; CHECK-NEXT: sldi r12, r15, 3
-; CHECK-NEXT: add r16, r30, r16
-; CHECK-NEXT: add r15, r30, r15
-; CHECK-NEXT: add r27, r11, r7
-; CHECK-NEXT: add r22, r24, r7
-; CHECK-NEXT: add r17, r19, r7
-; CHECK-NEXT: sldi r2, r14, 3
-; CHECK-NEXT: add r26, r24, r10
-; CHECK-NEXT: add r25, r24, r12
-; CHECK-NEXT: add r21, r19, r10
-; CHECK-NEXT: add r20, r19, r12
-; CHECK-NEXT: add r8, r11, r10
-; CHECK-NEXT: sldi r16, r16, 3
-; CHECK-NEXT: add r29, r5, r27
-; CHECK-NEXT: add r28, r4, r27
-; CHECK-NEXT: add r27, r3, r27
-; CHECK-NEXT: add r24, r5, r22
-; CHECK-NEXT: add r23, r4, r22
-; CHECK-NEXT: add r22, r3, r22
-; CHECK-NEXT: add r19, r5, r17
-; CHECK-NEXT: add r18, r4, r17
-; CHECK-NEXT: add r17, r3, r17
-; CHECK-NEXT: add r14, r5, r2
-; CHECK-NEXT: add r31, r4, r2
-; CHECK-NEXT: add r2, r3, r2
-; CHECK-NEXT: add r9, r5, r8
-; CHECK-NEXT: add r8, r11, r12
+; CHECK-NEXT: ld r0, -192(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r30, -184(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r8, -176(r1) # 8-byte Folded Reload
+; CHECK-NEXT: rldicl r7, r4, 62, 2
+; CHECK-NEXT: ld r9, -168(r1) # 8-byte Folded Reload
+; CHECK-NEXT: add r11, r0, r30
+; CHECK-NEXT: add r4, r0, r0
+; CHECK-NEXT: mulli r23, r0, 24
+; CHECK-NEXT: add r14, r0, r8
+; CHECK-NEXT: sldi r12, r0, 5
+; CHECK-NEXT: add r31, r0, r9
+; CHECK-NEXT: sldi r9, r9, 3
+; CHECK-NEXT: sldi r18, r0, 4
+; CHECK-NEXT: sldi r8, r8, 3
+; CHECK-NEXT: add r10, r4, r4
+; CHECK-NEXT: sldi r4, r30, 3
+; CHECK-NEXT: sldi r11, r11, 3
+; CHECK-NEXT: add r26, r12, r9
+; CHECK-NEXT: add r16, r18, r9
+; CHECK-NEXT: add r29, r12, r8
+; CHECK-NEXT: add r19, r18, r8
+; CHECK-NEXT: add r30, r12, r4
+; CHECK-NEXT: mr r20, r4
+; CHECK-NEXT: std r4, -200(r1) # 8-byte Folded Spill
+; CHECK-NEXT: ld r4, -160(r1) # 8-byte Folded Reload
+; CHECK-NEXT: add r15, r5, r11
+; CHECK-NEXT: sldi r11, r14, 3
+; CHECK-NEXT: add r29, r5, r29
+; CHECK-NEXT: add r28, r3, r26
+; CHECK-NEXT: add r19, r5, r19
+; CHECK-NEXT: add r21, r23, r9
+; CHECK-NEXT: add r24, r23, r8
+; CHECK-NEXT: add r14, r5, r11
+; CHECK-NEXT: sldi r11, r31, 3
+; CHECK-NEXT: add r25, r23, r20
+; CHECK-NEXT: add r20, r18, r20
+; CHECK-NEXT: add r30, r5, r30
+; CHECK-NEXT: add r18, r3, r16
+; CHECK-NEXT: add r24, r5, r24
+; CHECK-NEXT: add r23, r3, r21
+; CHECK-NEXT: add r27, r4, r26
+; CHECK-NEXT: add r22, r4, r21
+; CHECK-NEXT: add r17, r4, r16
+; CHECK-NEXT: add r2, r4, r11
+; CHECK-NEXT: rldicl r4, r7, 2, 1
+; CHECK-NEXT: sub r7, r8, r9
+; CHECK-NEXT: ld r8, -200(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r26, r5, r26
; CHECK-NEXT: add r25, r5, r25
; CHECK-NEXT: add r21, r5, r21
; CHECK-NEXT: add r20, r5, r20
; CHECK-NEXT: add r16, r5, r16
-; CHECK-NEXT: add r8, r5, r8
-; CHECK-NEXT: rldicl r3, r0, 2, 1
-; CHECK-NEXT: addi r3, r3, -4
-; CHECK-NEXT: sub r0, r12, r7
-; CHECK-NEXT: sub r12, r10, r7
-; CHECK-NEXT: li r7, 0
-; CHECK-NEXT: mr r10, r30
-; CHECK-NEXT: sldi r15, r15, 3
-; CHECK-NEXT: add r15, r5, r15
-; CHECK-NEXT: rldicl r3, r3, 62, 2
-; CHECK-NEXT: addi r3, r3, 1
-; CHECK-NEXT: mtctr r3
+; CHECK-NEXT: add r31, r5, r11
+; CHECK-NEXT: add r11, r3, r11
+; CHECK-NEXT: addi r4, r4, -4
+; CHECK-NEXT: rldicl r4, r4, 62, 2
+; CHECK-NEXT: sub r8, r8, r9
+; CHECK-NEXT: li r9, 0
+; CHECK-NEXT: addi r4, r4, 1
+; CHECK-NEXT: mtctr r4
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_3: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lfd f0, 0(r2)
-; CHECK-NEXT: lfd f1, 0(r31)
-; CHECK-NEXT: add r3, r10, r30
-; CHECK-NEXT: add r3, r3, r30
+; CHECK-NEXT: lfd f0, 0(r11)
+; CHECK-NEXT: lfd f1, 0(r2)
+; CHECK-NEXT: add r0, r0, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfd f1, 0(r14)
-; CHECK-NEXT: add r3, r3, r30
-; CHECK-NEXT: add r10, r3, r30
+; CHECK-NEXT: lfd f1, 0(r31)
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfd f0, 0(r14)
-; CHECK-NEXT: add r14, r14, r11
-; CHECK-NEXT: lfdx f0, r2, r0
-; CHECK-NEXT: lfdx f1, r31, r0
+; CHECK-NEXT: stfd f0, 0(r31)
+; CHECK-NEXT: add r31, r31, r12
+; CHECK-NEXT: lfdx f0, r11, r7
+; CHECK-NEXT: lfdx f1, r2, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r15, r7
+; CHECK-NEXT: lfdx f1, r14, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r15, r7
-; CHECK-NEXT: lfdx f0, r2, r12
-; CHECK-NEXT: lfdx f1, r31, r12
-; CHECK-NEXT: add r2, r2, r11
-; CHECK-NEXT: add r31, r31, r11
+; CHECK-NEXT: stfdx f0, r14, r9
+; CHECK-NEXT: lfdx f0, r11, r8
+; CHECK-NEXT: lfdx f1, r2, r8
+; CHECK-NEXT: add r11, r11, r12
+; CHECK-NEXT: add r2, r2, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r16, r7
+; CHECK-NEXT: lfdx f1, r15, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r16, r7
-; CHECK-NEXT: lfd f0, 0(r17)
-; CHECK-NEXT: lfd f1, 0(r18)
+; CHECK-NEXT: stfdx f0, r15, r9
+; CHECK-NEXT: lfd f0, 0(r18)
+; CHECK-NEXT: lfd f1, 0(r17)
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r19, r7
+; CHECK-NEXT: lfdx f1, r16, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r19, r7
-; CHECK-NEXT: lfdx f0, r17, r0
-; CHECK-NEXT: lfdx f1, r18, r0
+; CHECK-NEXT: stfdx f0, r16, r9
+; CHECK-NEXT: lfdx f0, r18, r7
+; CHECK-NEXT: lfdx f1, r17, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r20, r7
+; CHECK-NEXT: lfdx f1, r19, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r20, r7
-; CHECK-NEXT: lfdx f0, r17, r12
-; CHECK-NEXT: lfdx f1, r18, r12
-; CHECK-NEXT: add r17, r17, r11
-; CHECK-NEXT: add r18, r18, r11
+; CHECK-NEXT: stfdx f0, r19, r9
+; CHECK-NEXT: lfdx f0, r18, r8
+; CHECK-NEXT: lfdx f1, r17, r8
+; CHECK-NEXT: add r18, r18, r12
+; CHECK-NEXT: add r17, r17, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r21, r7
+; CHECK-NEXT: lfdx f1, r20, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r21, r7
-; CHECK-NEXT: lfd f0, 0(r22)
-; CHECK-NEXT: lfd f1, 0(r23)
+; CHECK-NEXT: stfdx f0, r20, r9
+; CHECK-NEXT: lfd f0, 0(r23)
+; CHECK-NEXT: lfd f1, 0(r22)
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r24, r7
+; CHECK-NEXT: lfdx f1, r21, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r24, r7
-; CHECK-NEXT: lfdx f0, r22, r0
-; CHECK-NEXT: lfdx f1, r23, r0
+; CHECK-NEXT: stfdx f0, r21, r9
+; CHECK-NEXT: lfdx f0, r23, r7
+; CHECK-NEXT: lfdx f1, r22, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r25, r7
+; CHECK-NEXT: lfdx f1, r24, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r25, r7
-; CHECK-NEXT: lfdx f0, r22, r12
-; CHECK-NEXT: lfdx f1, r23, r12
-; CHECK-NEXT: add r22, r22, r11
-; CHECK-NEXT: add r23, r23, r11
+; CHECK-NEXT: stfdx f0, r24, r9
+; CHECK-NEXT: lfdx f0, r23, r8
+; CHECK-NEXT: lfdx f1, r22, r8
+; CHECK-NEXT: add r23, r23, r12
+; CHECK-NEXT: add r22, r22, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r26, r7
+; CHECK-NEXT: lfdx f1, r25, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r26, r7
-; CHECK-NEXT: lfd f0, 0(r27)
-; CHECK-NEXT: lfd f1, 0(r28)
+; CHECK-NEXT: stfdx f0, r25, r9
+; CHECK-NEXT: lfd f0, 0(r28)
+; CHECK-NEXT: lfd f1, 0(r27)
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r29, r7
+; CHECK-NEXT: lfdx f1, r26, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r29, r7
-; CHECK-NEXT: lfdx f0, r27, r0
-; CHECK-NEXT: lfdx f1, r28, r0
+; CHECK-NEXT: stfdx f0, r26, r9
+; CHECK-NEXT: lfdx f0, r28, r7
+; CHECK-NEXT: lfdx f1, r27, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r8, r7
+; CHECK-NEXT: lfdx f1, r29, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r8, r7
-; CHECK-NEXT: lfdx f0, r27, r12
-; CHECK-NEXT: lfdx f1, r28, r12
-; CHECK-NEXT: add r27, r27, r11
-; CHECK-NEXT: add r28, r28, r11
+; CHECK-NEXT: stfdx f0, r29, r9
+; CHECK-NEXT: lfdx f0, r28, r8
+; CHECK-NEXT: lfdx f1, r27, r8
+; CHECK-NEXT: add r28, r28, r12
+; CHECK-NEXT: add r27, r27, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r9, r7
+; CHECK-NEXT: lfdx f1, r30, r9
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r9, r7
-; CHECK-NEXT: add r7, r7, r11
+; CHECK-NEXT: stfdx f0, r30, r9
+; CHECK-NEXT: add r9, r9, r12
; CHECK-NEXT: bdnz .LBB7_3
; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
+; CHECK-NEXT: ld r7, -192(r1) # 8-byte Folded Reload
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: beq cr0, .LBB7_7
; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader
-; CHECK-NEXT: ld r3, -184(r1) # 8-byte Folded Reload
-; CHECK-NEXT: ld r0, -160(r1) # 8-byte Folded Reload
-; CHECK-NEXT: sldi r8, r30, 3
-; CHECK-NEXT: add r3, r10, r3
-; CHECK-NEXT: sldi r3, r3, 3
-; CHECK-NEXT: add r7, r5, r3
-; CHECK-NEXT: add r9, r4, r3
-; CHECK-NEXT: add r11, r0, r3
-; CHECK-NEXT: ld r3, -176(r1) # 8-byte Folded Reload
-; CHECK-NEXT: add r3, r10, r3
-; CHECK-NEXT: sldi r3, r3, 3
-; CHECK-NEXT: add r12, r5, r3
-; CHECK-NEXT: add r30, r4, r3
-; CHECK-NEXT: add r29, r0, r3
-; CHECK-NEXT: ld r3, -168(r1) # 8-byte Folded Reload
-; CHECK-NEXT: add r3, r10, r3
-; CHECK-NEXT: li r10, 0
-; CHECK-NEXT: sldi r3, r3, 3
-; CHECK-NEXT: add r5, r5, r3
-; CHECK-NEXT: add r4, r4, r3
-; CHECK-NEXT: add r3, r0, r3
+; CHECK-NEXT: ld r4, -184(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r29, -160(r1) # 8-byte Folded Reload
+; CHECK-NEXT: mr r30, r3
+; CHECK-NEXT: sldi r7, r7, 3
+; CHECK-NEXT: add r4, r0, r4
+; CHECK-NEXT: sldi r4, r4, 3
+; CHECK-NEXT: add r3, r5, r4
+; CHECK-NEXT: add r8, r29, r4
+; CHECK-NEXT: add r9, r30, r4
+; CHECK-NEXT: ld r4, -176(r1) # 8-byte Folded Reload
+; CHECK-NEXT: add r4, r0, r4
+; CHECK-NEXT: sldi r4, r4, 3
+; CHECK-NEXT: add r10, r5, r4
+; CHECK-NEXT: add r11, r29, r4
+; CHECK-NEXT: add r12, r30, r4
+; CHECK-NEXT: ld r4, -168(r1) # 8-byte Folded Reload
+; CHECK-NEXT: add r4, r0, r4
+; CHECK-NEXT: sldi r0, r4, 3
+; CHECK-NEXT: add r5, r5, r0
+; CHECK-NEXT: add r4, r29, r0
+; CHECK-NEXT: add r30, r30, r0
+; CHECK-NEXT: li r0, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_6: # %for.body.epil
; CHECK-NEXT: #
-; CHECK-NEXT: lfdx f0, r3, r10
-; CHECK-NEXT: lfdx f1, r4, r10
+; CHECK-NEXT: lfdx f0, r30, r0
+; CHECK-NEXT: lfdx f1, r4, r0
; CHECK-NEXT: addi r6, r6, -1
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r5)
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r5)
-; CHECK-NEXT: add r5, r5, r8
-; CHECK-NEXT: lfdx f0, r29, r10
-; CHECK-NEXT: lfdx f1, r30, r10
+; CHECK-NEXT: add r5, r5, r7
+; CHECK-NEXT: lfdx f0, r12, r0
+; CHECK-NEXT: lfdx f1, r11, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r12, r10
+; CHECK-NEXT: lfdx f1, r10, r0
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r12, r10
-; CHECK-NEXT: lfdx f0, r11, r10
-; CHECK-NEXT: lfdx f1, r9, r10
+; CHECK-NEXT: stfdx f0, r10, r0
+; CHECK-NEXT: lfdx f0, r9, r0
+; CHECK-NEXT: lfdx f1, r8, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
-; CHECK-NEXT: lfdx f1, r7, r10
+; CHECK-NEXT: lfdx f1, r3, r0
; CHECK-NEXT: xsadddp f0, f1, f0
-; CHECK-NEXT: stfdx f0, r7, r10
-; CHECK-NEXT: add r10, r10, r8
+; CHECK-NEXT: stfdx f0, r3, r0
+; CHECK-NEXT: add r0, r0, r7
; CHECK-NEXT: bne cr0, .LBB7_6
; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup
; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 3611d92..e305a74 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -90,7 +90,7 @@
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: RISC-V DAG->DAG Pattern Instruction Selection
; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
-; CHECK-NEXT: RISC-V Fold Masks
+; CHECK-NEXT: RISC-V Vector Peephole Optimization
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Early Tail Duplication
; CHECK-NEXT: Optimize machine instruction PHIs
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
index 3a99f53..cb50ca4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
@@ -97,3 +97,112 @@ define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a,
store <2 x double> %g, ptr %z
ret void
}
+
+define void @vfwmacc_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2, <2 x double> %w) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v12, v8
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT: vfmul.vv v10, v12, v8
+; NO_FOLDING-NEXT: vfmadd.vv v12, v9, v11
+; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT: vse64.v v10, (a0)
+; NO_FOLDING-NEXT: vse64.v v12, (a1)
+; NO_FOLDING-NEXT: vse64.v v8, (a2)
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT: vfwmul.vv v12, v8, v9
+; FOLDING-NEXT: vfwmacc.vv v11, v8, v10
+; FOLDING-NEXT: vfwsub.vv v8, v9, v10
+; FOLDING-NEXT: vse64.v v12, (a0)
+; FOLDING-NEXT: vse64.v v11, (a1)
+; FOLDING-NEXT: vse64.v v8, (a2)
+; FOLDING-NEXT: ret
+ %c = fpext <2 x float> %a to <2 x double>
+ %d = fpext <2 x float> %b to <2 x double>
+ %d2 = fpext <2 x float> %b2 to <2 x double>
+ %e = fmul <2 x double> %c, %d
+ %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d2, <2 x double> %w)
+ %g = fsub <2 x double> %d, %d2
+ store <2 x double> %e, ptr %x
+ store <2 x double> %f, ptr %y
+ store <2 x double> %g, ptr %z
+ ret void
+}
+
+; Negative test. We can't fold because the FMA addend is a user.
+define void @vfwmacc_v2f32_multiple_users_addend_user(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_multiple_users_addend_user:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8
+; NO_FOLDING-NEXT: vfmadd.vv v11, v9, v8
+; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT: vse64.v v10, (a0)
+; NO_FOLDING-NEXT: vse64.v v11, (a1)
+; NO_FOLDING-NEXT: vse64.v v8, (a2)
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_multiple_users_addend_user:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT: vfwcvt.f.f.v v11, v8
+; FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; FOLDING-NEXT: vfwcvt.f.f.v v9, v10
+; FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; FOLDING-NEXT: vfmul.vv v10, v11, v8
+; FOLDING-NEXT: vfmadd.vv v11, v9, v8
+; FOLDING-NEXT: vfsub.vv v8, v8, v9
+; FOLDING-NEXT: vse64.v v10, (a0)
+; FOLDING-NEXT: vse64.v v11, (a1)
+; FOLDING-NEXT: vse64.v v8, (a2)
+; FOLDING-NEXT: ret
+ %c = fpext <2 x float> %a to <2 x double>
+ %d = fpext <2 x float> %b to <2 x double>
+ %d2 = fpext <2 x float> %b2 to <2 x double>
+ %e = fmul <2 x double> %c, %d
+ %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d2, <2 x double> %d)
+ %g = fsub <2 x double> %d, %d2
+ store <2 x double> %e, ptr %x
+ store <2 x double> %f, ptr %y
+ store <2 x double> %g, ptr %z
+ ret void
+}
+
+; Negative test. We can't fold because the FMA addend is a user.
+define void @vfwmacc_v2f32_addend_user(ptr %x, <2 x float> %a, <2 x float> %b) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_addend_user:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v10, v8
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT: vfmadd.vv v8, v10, v8
+; NO_FOLDING-NEXT: vse64.v v8, (a0)
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_addend_user:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT: vfwcvt.f.f.v v10, v8
+; FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; FOLDING-NEXT: vfmadd.vv v8, v10, v8
+; FOLDING-NEXT: vse64.v v8, (a0)
+; FOLDING-NEXT: ret
+ %c = fpext <2 x float> %a to <2 x double>
+ %d = fpext <2 x float> %b to <2 x double>
+ %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d, <2 x double> %d)
+ store <2 x double> %f, ptr %x
+ ret void
+}
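
The tests above exercise combining a multiply or fma fed by fpext values into RISC-V widening vector ops (vfwmul/vfwmacc/vfwsub). A minimal sketch of the positive pattern, assuming the unsuffixed @llvm.fma spelling used in the tests (illustrative only, not part of the patch):

  %c = fpext <2 x float> %a to <2 x double>
  %d = fpext <2 x float> %b to <2 x double>
  %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d, <2 x double> %acc)
  ; => vfwmacc.vv v_acc, v_a, v_b   ; vd[i] += vs1[i] * vs2[i], sources at SEW=32, accumulator at SEW=64

The fold wants the addend to be an ordinary wide accumulator; when the addend is itself one of the fpext results (the *_addend_user cases above), the combine leaves the code unfolded, which is what the FOLDING check lines verify.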
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
index 1803b52..5140d89 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
@@ -2031,11 +2031,8 @@ define <8 x double> @vfwnmsac_fv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h
define <2 x float> @vfwmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwmacc_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfmacc.vf v8, fa5, v10
+; CHECK-NEXT: vfwmacc.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
@@ -2048,11 +2045,8 @@ define <2 x float> @vfwmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c)
define <2 x float> @vfwmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwmsac_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfmsac.vf v8, fa5, v10
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
@@ -2066,11 +2060,8 @@ define <2 x float> @vfwmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c)
define <2 x float> @vfwnmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwnmacc_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfnmacc.vf v8, fa5, v10
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
@@ -2085,11 +2076,8 @@ define <2 x float> @vfwnmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c)
define <2 x float> @vfwnmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwnmsac_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfnmsac.vf v8, fa5, v10
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
diff --git a/llvm/test/CodeGen/SystemZ/vec-combine-01.ll b/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
index 6f0abd6..16231b2 100644
--- a/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
@@ -153,3 +153,13 @@ define void @f7(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) {
store i8 %trunc3, ptr %ptr4
ret void
}
+
+; Test that a truncating store with a non-simple VT can be handled.
+define void @f8(ptr %src, ptr %dst) {
+; CHECK-LABEL: f8:
+ %1 = load <12 x i32>, ptr %src, align 64
+ %2 = extractelement <12 x i32> %1, i64 11
+ %3 = trunc i32 %2 to i16
+ store i16 %3, ptr %dst, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/known-bits.ll b/llvm/test/CodeGen/X86/known-bits.ll
index 9741f6f..f7b2538 100644
--- a/llvm/test/CodeGen/X86/known-bits.ll
+++ b/llvm/test/CodeGen/X86/known-bits.ll
@@ -115,7 +115,7 @@ define i128 @knownbits_mask_addc_shl(i64 %a0, i64 %a1, i64 %a2) nounwind {
; X64-NEXT: andq $-1024, %rdi # imm = 0xFC00
; X64-NEXT: andq $-1024, %rsi # imm = 0xFC00
; X64-NEXT: addq %rdi, %rsi
-; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: adcl $0, %edx
; X64-NEXT: shldq $54, %rsi, %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
index 8110dd4..5da6832 100644
--- a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
@@ -737,6 +737,29 @@ entry:
ret void
}
+define void @freeze_vector_insert(<2 x float> %vec, i32 %idx, float %scalar) sanitize_numerical_stability {
+; CHECK-LABEL: @freeze_vector_insert(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (ptr @freeze_vector_insert to i64)
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr @__nsan_shadow_args_ptr, align 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[VEC:%.*]] to <2 x double>
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr getelementptr ([16384 x i8], ptr @__nsan_shadow_args_ptr, i64 0, i64 16), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = fpext float [[SCALAR:%.*]] to double
+; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], double [[TMP5]], double [[TMP6]]
+; CHECK-NEXT: store i64 0, ptr @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[VEC]], float [[SCALAR]], i32 [[IDX:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP7]], i32 [[IDX]]
+; CHECK-NEXT: [[FROZEN:%.*]] = freeze <2 x float> [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = freeze <2 x double> [[TMP9]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %1 = insertelement <2 x float> %vec, float %scalar, i32 %idx
+ %frozen = freeze <2 x float> %1
+ ret void
+}
define void @vector_shuffle(<2 x float> %0) sanitize_numerical_stability {
; CHECK-LABEL: @vector_shuffle(
diff --git a/llvm/test/MC/LoongArch/Basic/Integer/invalid.s b/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
index 958d5ca..08a131d 100644
--- a/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
+++ b/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
@@ -99,11 +99,13 @@ jirl $a0, $a0, 0x20000
# CHECK: :[[#@LINE-1]]:16: error: operand must be a symbol with modifier (e.g. %b16) or an integer in the range [-131072, 131068]
## simm20
-pcaddi $a0, -0x80001
-# CHECK: :[[#@LINE-1]]:13: error: immediate must be an integer in the range [-524288, 524287]
pcaddu12i $a0, 0x80000
# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-524288, 524287]
+## simm20_pcaddi
+pcaddi $a0, -0x80001
+# CHECK: :[[#@LINE-1]]:13: error: operand must be a symbol with modifier (e.g. %pcrel_20) or an integer in the range [-524288, 524287]
+
## simm20_lu12iw
lu12i.w $a0, -0x80001
# CHECK: :[[#@LINE-1]]:14: error: operand must be a symbol with modifier (e.g. %abs_hi20) or an integer in the range [-524288, 524287]
diff --git a/llvm/test/MC/LoongArch/Relocations/relocations.s b/llvm/test/MC/LoongArch/Relocations/relocations.s
index e83b671..091dce2 100644
--- a/llvm/test/MC/LoongArch/Relocations/relocations.s
+++ b/llvm/test/MC/LoongArch/Relocations/relocations.s
@@ -288,3 +288,23 @@ addi.d $t1, $a2, %le_lo12_r(foo)
# RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0
# INSTR: addi.d $t1, $a2, %le_lo12_r(foo)
# FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE
+
+pcaddi $t1, %pcrel_20(foo)
+# RELOC: R_LARCH_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %ld_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_LD_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %ld_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %ld_pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %gd_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_GD_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %gd_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %gd_pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %desc_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_DESC_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %desc_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %desc_pcrel_20(foo), kind: FK_NONE
diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll
new file mode 100644
index 0000000..8bda74e
--- /dev/null
+++ b/llvm/test/Transforms/LICM/hoist-binop.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=licm < %s | FileCheck %s
+
+; Fold ADD and remove old op if unused.
+define void @add_one_use(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_one_use(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add i64 %index, %c1
+ %index.next = add i64 %step.add, %c2
+ br label %loop
+}
+
+; Fold ADD and copy NUW if both ops have it.
+; https://alive2.llvm.org/ce/z/bPAT7Z
+define void @add_nuw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_nuw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add nuw i64 [[INDEX]], [[C1]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add nuw i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = add nuw i64 %step.add, %c2
+ br label %loop
+}
+
+; Fold ADD but don't copy NUW if only one op has it.
+define void @add_no_nuw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nuw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = add nuw i64 %step.add, %c2
+ br label %loop
+}
+
+; Fold ADD but don't copy NSW if one op has it.
+define void @add_no_nsw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nsw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = add nsw i64 %step.add, %c2
+ br label %loop
+}
+
+; Fold ADD but don't copy NSW even if both ops have it.
+define void @add_no_nsw_2(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nsw_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add nsw i64 [[INDEX]], [[C1]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add nsw i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = add nsw i64 %step.add, %c2
+ br label %loop
+}
+
+; Don't fold if the ops are different (even if they are both associative).
+define void @diff_ops(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @diff_ops(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT]] = mul i64 [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = add i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = mul i64 %step.add, %c2
+ br label %loop
+}
+
+; Don't fold if the ops are not associative.
+define void @noassoc_ops(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @noassoc_ops(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = sub i64 [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT: call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT]] = sub i64 [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+ %step.add = sub i64 %index, %c1
+ call void @use(i64 %step.add)
+ %index.next = sub i64 %step.add, %c2
+ br label %loop
+}
+
+; Don't fold floating-point ops, even if they are associative. This would be
+; valid, but is currently disabled.
+define void @fadd(float %c1, float %c2) {
+; CHECK-LABEL: @fadd(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = fadd fast float [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT: call void @use(float [[STEP_ADD]])
+; CHECK-NEXT: [[INDEX_NEXT]] = fadd fast float [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi float [ 0., %entry ], [ %index.next, %loop ]
+ %step.add = fadd fast float %index, %c1
+ call void @use(float %step.add)
+ %index.next = fadd fast float %step.add, %c2
+ br label %loop
+}
+
+; Original reproducer, adapted from:
+; for(long i = 0; i < n; ++i)
+; a[i] = (i*k) * v;
+define void @test(i64 %n, i64 %k) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[K_2:%.*]] = shl nuw nsw i64 [[K:%.*]], 1
+; CHECK-NEXT: [[VEC_INIT:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[K]], i64 1
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[K_2]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add <2 x i64> [[DOTSPLAT]], [[DOTSPLAT]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ [[VEC_INIT]], [[ENTRY:%.*]] ], [ [[VEC_IND_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: call void @use(<2 x i64> [[STEP_ADD]])
+; CHECK-NEXT: [[VEC_IND_NEXT_REASS]] = add <2 x i64> [[VEC_IND]], [[INVARIANT_OP]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ %k.2 = shl nuw nsw i64 %k, 1
+ %vec.init = insertelement <2 x i64> zeroinitializer, i64 %k, i64 1
+ %.splatinsert = insertelement <2 x i64> poison, i64 %k.2, i64 0
+ %.splat = shufflevector <2 x i64> %.splatinsert, <2 x i64> poison, <2 x i32> zeroinitializer
+ br label %loop
+
+loop:
+ %vec.ind = phi <2 x i64> [ %vec.init, %entry ], [ %vec.ind.next, %loop ]
+ %step.add = add <2 x i64> %vec.ind, %.splat
+ call void @use(<2 x i64> %step.add)
+ %vec.ind.next = add <2 x i64> %step.add, %.splat
+ br label %loop
+}
+
+declare void @use()
diff --git a/llvm/test/Transforms/LICM/sink-foldable.ll b/llvm/test/Transforms/LICM/sink-foldable.ll
index 38577a5..36e2eab 100644
--- a/llvm/test/Transforms/LICM/sink-foldable.ll
+++ b/llvm/test/Transforms/LICM/sink-foldable.ll
@@ -77,9 +77,10 @@ return:
define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add i32 1, 1
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond:
-; CHECK-NEXT: [[I_ADDR_0:%.*]] = phi i32 [ [[ADD:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT: [[I_ADDR_0:%.*]] = phi i32 [ [[ADD_REASS:%.*]], [[IF_END:%.*]] ]
; CHECK-NEXT: [[P_ADDR_0:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[IF_END]] ]
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_ADDR_0]], [[J:%.*]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[LOOPEXIT0:%.*]]
@@ -97,7 +98,7 @@ define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[ADD_PTR]], i64 [[IDX2_EXT]]
; CHECK-NEXT: [[L1:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt ptr [[L1]], [[Q]]
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[ADD_I]], 1
+; CHECK-NEXT: [[ADD_REASS]] = add i32 [[I_ADDR]], [[INVARIANT_OP]]
; CHECK-NEXT: br i1 [[CMP2]], label [[LOOPEXIT2:%.*]], label [[FOR_COND]]
; CHECK: loopexit0:
; CHECK-NEXT: [[P0:%.*]] = phi ptr [ null, [[FOR_COND]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 3a14842..9c9547c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -135,8 +135,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Not Interleaving.
; CHECK-NEXT: LV: Interleaving is not beneficial.
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
-; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4
+; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
; CHECK-NEXT: Live-in vp<%0> = VF * UF
@@ -340,8 +340,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Not Interleaving.
; CHECK-NEXT: LV: Interleaving is not beneficial.
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
-; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4
+; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
; CHECK-NEXT: Live-in vp<%0> = VF * UF
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
index 4dea523..31f0e06 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -6,23 +6,19 @@ define i16 @test() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
; CHECK-NEXT: [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
-; CHECK-NEXT: [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
; CHECK-NEXT: br label [[WHILE:%.*]]
; CHECK: while:
-; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ]
+; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX26:%.*]], [[WHILE]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A2]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr [[A]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i64> [[TMP6]], i64 [[TMP1]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 4, i32 5, i32 8, i32 8>
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP10]])
-; CHECK-NEXT: [[OP_RDX5]] = xor i64 [[TMP3]], [[TMP11]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = xor i64 0, [[TMP2]]
+; CHECK-NEXT: [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[OP_RDX25:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX24]]
+; CHECK-NEXT: [[OP_RDX26]] = xor i64 [[OP_RDX25]], [[TMP5]]
; CHECK-NEXT: br label [[WHILE]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
index 8f1d7a1..69ecf18 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
@@ -7,9 +7,11 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7
; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4
; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1
+; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6
; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2
+; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5
; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4
; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3
@@ -19,11 +21,10 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT: [[T9:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[T15:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270
; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137
; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
@@ -33,19 +34,20 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T27]], i32 2
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T47]], i32 3
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> <i32 1, i32 0, i32 6, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T40]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 poison, i32 poison, i32 3>
-; CHECK-NEXT: [[T701:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T50]], i32 5
+; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2
+; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
-; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
+; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SimplifyCFG/AMDGPU/skip-threading.ll b/llvm/test/Transforms/SimplifyCFG/AMDGPU/skip-threading.ll
new file mode 100644
index 0000000..b1262e2
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/AMDGPU/skip-threading.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn -S -passes=simplifycfg < %s | FileCheck %s
+
+declare void @bar1()
+declare void @bar2()
+declare void @bar3()
+
+define i32 @test_01a(i32 %a) {
+; CHECK-LABEL: define i32 @test_01a(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT: br i1 [[COND]], label %[[MERGE:.*]], label %[[IF_FALSE:.*]]
+; CHECK: [[IF_FALSE]]:
+; CHECK-NEXT: call void @bar1()
+; CHECK-NEXT: br label %[[MERGE]]
+; CHECK: [[MERGE]]:
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[IF_FALSE_2:.*]]
+; CHECK: [[IF_FALSE_2]]:
+; CHECK-NEXT: call void @bar3()
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[A]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %merge, label %if.false
+
+if.false:
+ call void @bar1()
+ br label %merge
+
+merge:
+ call void @bar2()
+ br i1 %cond, label %exit, label %if.false.2
+
+if.false.2:
+ call void @bar3()
+ br label %exit
+
+exit:
+ ret i32 %a
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/convergent.ll b/llvm/test/Transforms/SimplifyCFG/convergent.ll
index 6ba51e0..d148063 100644
--- a/llvm/test/Transforms/SimplifyCFG/convergent.ll
+++ b/llvm/test/Transforms/SimplifyCFG/convergent.ll
@@ -4,6 +4,9 @@
; RUN: opt -S -passes='simplifycfg<hoist-common-insts;sink-common-insts>' < %s | FileCheck -check-prefixes=CHECK,SINK %s
declare void @foo() convergent
+declare void @bar1()
+declare void @bar2()
+declare void @bar3()
declare i32 @tid()
declare i32 @mbcnt(i32 %a, i32 %b) convergent
declare i32 @bpermute(i32 %a, i32 %b) convergent
@@ -45,6 +48,42 @@ exit:
ret i32 %a
}
+define i32 @test_01a(i32 %a) {
+; CHECK-LABEL: @test_01a(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT_CRITEDGE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: call void @bar1()
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: call void @bar3()
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: exit.critedge:
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 [[A]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %merge, label %if.false
+
+if.false:
+ call void @bar1()
+ br label %merge
+
+merge:
+ call void @bar2()
+ br i1 %cond, label %exit, label %if.false.2
+
+if.false.2:
+ call void @bar3()
+ br label %exit
+
+exit:
+ ret i32 %a
+}
+
define void @test_02(ptr %y.coerce) convergent {
; NOSINK-LABEL: @test_02(
; NOSINK-NEXT: entry:
diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
index 904454a..df0053a 100644
--- a/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 2 7 0.50 * psubb (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubd %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubd (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm0, %mm2
-# CHECK-NEXT: 2 8 1.00 * psubq (%rax), %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm0, %mm2
+# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 psubq %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubsb %xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - 172.00 75.83 118.33 17.00 100.83 67.00 67.00
+# CHECK-NEXT: - 172.00 75.83 117.33 17.00 101.83 67.00 67.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -908,8 +908,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubb (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubd %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubd (%rax), %xmm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm0, %mm2
-# CHECK-NEXT: - - - 1.00 - - 0.50 0.50 psubq (%rax), %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %mm0, %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubsb %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
index 71902fe..54ff013 100644
--- a/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
+++ b/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
@@ -49,13 +49,13 @@ pxor %xmm2, %xmm2
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 35
-# CHECK-NEXT: Total Cycles: 39
+# CHECK-NEXT: Total Cycles: 37
# CHECK-NEXT: Total uOps: 35
# CHECK: Dispatch Width: 4
-# CHECK-NEXT: uOps Per Cycle: 0.90
-# CHECK-NEXT: IPC: 0.90
-# CHECK-NEXT: Block RThroughput: 11.0
+# CHECK-NEXT: uOps Per Cycle: 0.95
+# CHECK-NEXT: IPC: 0.95
+# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
@@ -79,7 +79,7 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: 1 0 0.25 pcmpgtw %xmm2, %xmm2
# CHECK-NEXT: 1 3 1.00 psubb %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubd %mm2, %mm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm2, %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubw %mm2, %mm2
# CHECK-NEXT: 1 0 0.25 psubb %xmm2, %xmm2
# CHECK-NEXT: 1 0 0.25 psubd %xmm2, %xmm2
@@ -118,7 +118,7 @@ pxor %xmm2, %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - - 2.00 12.00 - 6.00 - -
+# CHECK-NEXT: - - 3.00 11.00 - 6.00 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -135,7 +135,7 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: - - - - - - - - pcmpgtw %xmm2, %xmm2
# CHECK-NEXT: - - - 1.00 - - - - psubb %mm2, %mm2
# CHECK-NEXT: - - - 1.00 - - - - psubd %mm2, %mm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm2, %mm2
+# CHECK-NEXT: - - - - - 1.00 - - psubq %mm2, %mm2
# CHECK-NEXT: - - - 1.00 - - - - psubw %mm2, %mm2
# CHECK-NEXT: - - - - - - - - psubb %xmm2, %xmm2
# CHECK-NEXT: - - - - - - - - psubd %xmm2, %xmm2
@@ -155,48 +155,48 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: - - 1.00 - - - - - pandn %xmm2, %xmm2
# CHECK-NEXT: - - - - - - - - xorps %xmm0, %xmm0
# CHECK-NEXT: - - - - - - - - xorpd %xmm1, %xmm1
-# CHECK-NEXT: - - - - - 1.00 - - pxor %mm2, %mm2
+# CHECK-NEXT: - - 1.00 - - - - - pxor %mm2, %mm2
# CHECK-NEXT: - - - - - - - - pxor %xmm2, %xmm2
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012345678
+# CHECK-NEXT: 0123456789 0123456
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DR . . . . . . . . subl %eax, %eax
-# CHECK-NEXT: [0,1] DR . . . . . . . . subq %rax, %rax
-# CHECK-NEXT: [0,2] DR . . . . . . . . xorl %eax, %eax
-# CHECK-NEXT: [0,3] DR . . . . . . . . xorq %rax, %rax
-# CHECK-NEXT: [0,4] .DeeeER . . . . . . . pcmpgtb %mm2, %mm2
-# CHECK-NEXT: [0,5] .D===eeeER. . . . . . . pcmpgtd %mm2, %mm2
-# CHECK-NEXT: [0,6] .D======eeeER . . . . . . pcmpgtw %mm2, %mm2
-# CHECK-NEXT: [0,7] .D----------R . . . . . . pcmpgtb %xmm2, %xmm2
-# CHECK-NEXT: [0,8] . D---------R . . . . . . pcmpgtd %xmm2, %xmm2
-# CHECK-NEXT: [0,9] . D---------R . . . . . . pcmpgtq %xmm2, %xmm2
-# CHECK-NEXT: [0,10] . D---------R . . . . . . pcmpgtw %xmm2, %xmm2
-# CHECK-NEXT: [0,11] . D========eeeER . . . . . psubb %mm2, %mm2
-# CHECK-NEXT: [0,12] . D==========eeeER . . . . . psubd %mm2, %mm2
-# CHECK-NEXT: [0,13] . D=============eeeER . . . . psubq %mm2, %mm2
-# CHECK-NEXT: [0,14] . D================eeeER. . . . psubw %mm2, %mm2
-# CHECK-NEXT: [0,15] . D--------------------R. . . . psubb %xmm2, %xmm2
-# CHECK-NEXT: [0,16] . D-------------------R. . . . psubd %xmm2, %xmm2
-# CHECK-NEXT: [0,17] . D-------------------R. . . . psubq %xmm2, %xmm2
-# CHECK-NEXT: [0,18] . D-------------------R. . . . psubw %xmm2, %xmm2
-# CHECK-NEXT: [0,19] . D==================eeeER . . . psubsb %mm2, %mm2
-# CHECK-NEXT: [0,20] . D====================eeeER . . psubsw %mm2, %mm2
-# CHECK-NEXT: [0,21] . DeE----------------------R . . psubsb %xmm2, %xmm2
-# CHECK-NEXT: [0,22] . D=eE---------------------R . . psubsw %xmm2, %xmm2
-# CHECK-NEXT: [0,23] . D=======================eeeER . . psubusb %mm2, %mm2
-# CHECK-NEXT: [0,24] . .D=========================eeeER . psubusw %mm2, %mm2
-# CHECK-NEXT: [0,25] . .D=eE--------------------------R . psubusb %xmm2, %xmm2
-# CHECK-NEXT: [0,26] . .D==eE-------------------------R . psubusw %xmm2, %xmm2
-# CHECK-NEXT: [0,27] . .D==eE-------------------------R . andnps %xmm0, %xmm0
-# CHECK-NEXT: [0,28] . . D==eE------------------------R . andnpd %xmm1, %xmm1
-# CHECK-NEXT: [0,29] . . D===========================eER. pandn %mm2, %mm2
-# CHECK-NEXT: [0,30] . . D==eE-------------------------R. pandn %xmm2, %xmm2
-# CHECK-NEXT: [0,31] . . D-----------------------------R. xorps %xmm0, %xmm0
-# CHECK-NEXT: [0,32] . . D----------------------------R. xorpd %xmm1, %xmm1
-# CHECK-NEXT: [0,33] . . D===========================eER pxor %mm2, %mm2
-# CHECK-NEXT: [0,34] . . D-----------------------------R pxor %xmm2, %xmm2
+# CHECK: [0,0] DR . . . . . . .. subl %eax, %eax
+# CHECK-NEXT: [0,1] DR . . . . . . .. subq %rax, %rax
+# CHECK-NEXT: [0,2] DR . . . . . . .. xorl %eax, %eax
+# CHECK-NEXT: [0,3] DR . . . . . . .. xorq %rax, %rax
+# CHECK-NEXT: [0,4] .DeeeER . . . . . .. pcmpgtb %mm2, %mm2
+# CHECK-NEXT: [0,5] .D===eeeER. . . . . .. pcmpgtd %mm2, %mm2
+# CHECK-NEXT: [0,6] .D======eeeER . . . . .. pcmpgtw %mm2, %mm2
+# CHECK-NEXT: [0,7] .D----------R . . . . .. pcmpgtb %xmm2, %xmm2
+# CHECK-NEXT: [0,8] . D---------R . . . . .. pcmpgtd %xmm2, %xmm2
+# CHECK-NEXT: [0,9] . D---------R . . . . .. pcmpgtq %xmm2, %xmm2
+# CHECK-NEXT: [0,10] . D---------R . . . . .. pcmpgtw %xmm2, %xmm2
+# CHECK-NEXT: [0,11] . D========eeeER . . . .. psubb %mm2, %mm2
+# CHECK-NEXT: [0,12] . D==========eeeER . . . .. psubd %mm2, %mm2
+# CHECK-NEXT: [0,13] . D=============eER. . . .. psubq %mm2, %mm2
+# CHECK-NEXT: [0,14] . D==============eeeER . . .. psubw %mm2, %mm2
+# CHECK-NEXT: [0,15] . D------------------R . . .. psubb %xmm2, %xmm2
+# CHECK-NEXT: [0,16] . D-----------------R . . .. psubd %xmm2, %xmm2
+# CHECK-NEXT: [0,17] . D-----------------R . . .. psubq %xmm2, %xmm2
+# CHECK-NEXT: [0,18] . D-----------------R . . .. psubw %xmm2, %xmm2
+# CHECK-NEXT: [0,19] . D================eeeER . .. psubsb %mm2, %mm2
+# CHECK-NEXT: [0,20] . D==================eeeER . .. psubsw %mm2, %mm2
+# CHECK-NEXT: [0,21] . DeE--------------------R . .. psubsb %xmm2, %xmm2
+# CHECK-NEXT: [0,22] . D=eE-------------------R . .. psubsw %xmm2, %xmm2
+# CHECK-NEXT: [0,23] . D=====================eeeER .. psubusb %mm2, %mm2
+# CHECK-NEXT: [0,24] . .D=======================eeeER.. psubusw %mm2, %mm2
+# CHECK-NEXT: [0,25] . .D=eE------------------------R.. psubusb %xmm2, %xmm2
+# CHECK-NEXT: [0,26] . .D==eE-----------------------R.. psubusw %xmm2, %xmm2
+# CHECK-NEXT: [0,27] . .D==eE-----------------------R.. andnps %xmm0, %xmm0
+# CHECK-NEXT: [0,28] . . D==eE----------------------R.. andnpd %xmm1, %xmm1
+# CHECK-NEXT: [0,29] . . D=========================eER. pandn %mm2, %mm2
+# CHECK-NEXT: [0,30] . . D==eE-----------------------R. pandn %xmm2, %xmm2
+# CHECK-NEXT: [0,31] . . D---------------------------R. xorps %xmm0, %xmm0
+# CHECK-NEXT: [0,32] . . D--------------------------R. xorpd %xmm1, %xmm1
+# CHECK-NEXT: [0,33] . . D=========================eER pxor %mm2, %mm2
+# CHECK-NEXT: [0,34] . . D---------------------------R pxor %xmm2, %xmm2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -219,25 +219,25 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: 11. 1 9.0 0.0 0.0 psubb %mm2, %mm2
# CHECK-NEXT: 12. 1 11.0 0.0 0.0 psubd %mm2, %mm2
# CHECK-NEXT: 13. 1 14.0 0.0 0.0 psubq %mm2, %mm2
-# CHECK-NEXT: 14. 1 17.0 0.0 0.0 psubw %mm2, %mm2
-# CHECK-NEXT: 15. 1 0.0 0.0 20.0 psubb %xmm2, %xmm2
-# CHECK-NEXT: 16. 1 0.0 0.0 19.0 psubd %xmm2, %xmm2
-# CHECK-NEXT: 17. 1 0.0 0.0 19.0 psubq %xmm2, %xmm2
-# CHECK-NEXT: 18. 1 0.0 0.0 19.0 psubw %xmm2, %xmm2
-# CHECK-NEXT: 19. 1 19.0 0.0 0.0 psubsb %mm2, %mm2
-# CHECK-NEXT: 20. 1 21.0 0.0 0.0 psubsw %mm2, %mm2
-# CHECK-NEXT: 21. 1 1.0 1.0 22.0 psubsb %xmm2, %xmm2
-# CHECK-NEXT: 22. 1 2.0 0.0 21.0 psubsw %xmm2, %xmm2
-# CHECK-NEXT: 23. 1 24.0 0.0 0.0 psubusb %mm2, %mm2
-# CHECK-NEXT: 24. 1 26.0 0.0 0.0 psubusw %mm2, %mm2
-# CHECK-NEXT: 25. 1 2.0 0.0 26.0 psubusb %xmm2, %xmm2
-# CHECK-NEXT: 26. 1 3.0 0.0 25.0 psubusw %xmm2, %xmm2
-# CHECK-NEXT: 27. 1 3.0 3.0 25.0 andnps %xmm0, %xmm0
-# CHECK-NEXT: 28. 1 3.0 3.0 24.0 andnpd %xmm1, %xmm1
-# CHECK-NEXT: 29. 1 28.0 0.0 0.0 pandn %mm2, %mm2
-# CHECK-NEXT: 30. 1 3.0 0.0 25.0 pandn %xmm2, %xmm2
-# CHECK-NEXT: 31. 1 0.0 0.0 29.0 xorps %xmm0, %xmm0
-# CHECK-NEXT: 32. 1 0.0 0.0 28.0 xorpd %xmm1, %xmm1
-# CHECK-NEXT: 33. 1 28.0 0.0 0.0 pxor %mm2, %mm2
-# CHECK-NEXT: 34. 1 0.0 0.0 29.0 pxor %xmm2, %xmm2
-# CHECK-NEXT: 1 6.5 0.2 10.5 <total>
+# CHECK-NEXT: 14. 1 15.0 0.0 0.0 psubw %mm2, %mm2
+# CHECK-NEXT: 15. 1 0.0 0.0 18.0 psubb %xmm2, %xmm2
+# CHECK-NEXT: 16. 1 0.0 0.0 17.0 psubd %xmm2, %xmm2
+# CHECK-NEXT: 17. 1 0.0 0.0 17.0 psubq %xmm2, %xmm2
+# CHECK-NEXT: 18. 1 0.0 0.0 17.0 psubw %xmm2, %xmm2
+# CHECK-NEXT: 19. 1 17.0 0.0 0.0 psubsb %mm2, %mm2
+# CHECK-NEXT: 20. 1 19.0 0.0 0.0 psubsw %mm2, %mm2
+# CHECK-NEXT: 21. 1 1.0 1.0 20.0 psubsb %xmm2, %xmm2
+# CHECK-NEXT: 22. 1 2.0 0.0 19.0 psubsw %xmm2, %xmm2
+# CHECK-NEXT: 23. 1 22.0 0.0 0.0 psubusb %mm2, %mm2
+# CHECK-NEXT: 24. 1 24.0 0.0 0.0 psubusw %mm2, %mm2
+# CHECK-NEXT: 25. 1 2.0 0.0 24.0 psubusb %xmm2, %xmm2
+# CHECK-NEXT: 26. 1 3.0 0.0 23.0 psubusw %xmm2, %xmm2
+# CHECK-NEXT: 27. 1 3.0 3.0 23.0 andnps %xmm0, %xmm0
+# CHECK-NEXT: 28. 1 3.0 3.0 22.0 andnpd %xmm1, %xmm1
+# CHECK-NEXT: 29. 1 26.0 0.0 0.0 pandn %mm2, %mm2
+# CHECK-NEXT: 30. 1 3.0 0.0 23.0 pandn %xmm2, %xmm2
+# CHECK-NEXT: 31. 1 0.0 0.0 27.0 xorps %xmm0, %xmm0
+# CHECK-NEXT: 32. 1 0.0 0.0 26.0 xorpd %xmm1, %xmm1
+# CHECK-NEXT: 33. 1 26.0 0.0 0.0 pxor %mm2, %mm2
+# CHECK-NEXT: 34. 1 0.0 0.0 27.0 pxor %xmm2, %xmm2
+# CHECK-NEXT: 1 6.1 0.2 9.7 <total>
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
index 69491f0..53b9d22 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
@@ -173,11 +173,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: 1 5 0.50 * movq (%rax), %mm2
# CHECK-NEXT: 1 1 1.00 movq %mm0, %rcx
# CHECK-NEXT: 2 1 1.00 * movq %mm0, (%rax)
-# CHECK-NEXT: 3 3 2.00 packsswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packsswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packsswb (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packssdw %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packssdw %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packssdw (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packuswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packuswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packuswb (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 paddb %mm0, %mm2
# CHECK-NEXT: 2 6 0.50 * paddb (%rax), %mm2
@@ -284,7 +284,7 @@ pxor (%rax), %mm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 46.67 35.17 23.67 23.67 2.00 57.17 1.00 0.67
+# CHECK-NEXT: - - 45.92 34.42 23.67 23.67 2.00 56.42 0.25 0.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -297,11 +297,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - - - - movq (%rax), %mm2
# CHECK-NEXT: - - 1.00 - - - - - - - movq %mm0, %rcx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 movq %mm0, (%rax)
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packsswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packsswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packsswb (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packssdw %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packssdw %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packssdw (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packuswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packuswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packuswb (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - - - 0.50 - - paddb %mm0, %mm2
# CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - paddb (%rax), %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
index 904454a..df0053a 100644
--- a/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 2 7 0.50 * psubb (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubd %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubd (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm0, %mm2
-# CHECK-NEXT: 2 8 1.00 * psubq (%rax), %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm0, %mm2
+# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 psubq %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubsb %xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - 172.00 75.83 118.33 17.00 100.83 67.00 67.00
+# CHECK-NEXT: - 172.00 75.83 117.33 17.00 101.83 67.00 67.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -908,8 +908,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubb (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubd %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubd (%rax), %xmm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm0, %mm2
-# CHECK-NEXT: - - - 1.00 - - 0.50 0.50 psubq (%rax), %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %mm0, %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubsb %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
index 5094dd1..01f516a 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
@@ -173,11 +173,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: 1 5 0.50 * movq (%rax), %mm2
# CHECK-NEXT: 1 1 1.00 movq %mm0, %rcx
# CHECK-NEXT: 2 1 1.00 * movq %mm0, (%rax)
-# CHECK-NEXT: 3 3 2.00 packsswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packsswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packsswb (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packssdw %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packssdw %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packssdw (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packuswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packuswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packuswb (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 paddb %mm0, %mm2
# CHECK-NEXT: 2 6 0.50 * paddb (%rax), %mm2
@@ -284,7 +284,7 @@ pxor (%rax), %mm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 46.67 35.17 23.67 23.67 2.00 57.17 1.00 0.67
+# CHECK-NEXT: - - 45.92 34.42 23.67 23.67 2.00 56.42 0.25 0.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -297,11 +297,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - - - - movq (%rax), %mm2
# CHECK-NEXT: - - 1.00 - - - - - - - movq %mm0, %rcx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 movq %mm0, (%rax)
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packsswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packsswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packsswb (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packssdw %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packssdw %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packssdw (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packuswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packuswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packuswb (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - - - 0.50 - - paddb %mm0, %mm2
# CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - paddb (%rax), %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
index c3b8b73..e2cfd02 100644
--- a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 2 7 0.50 * psubb (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubd %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubd (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm0, %mm2
-# CHECK-NEXT: 2 8 1.00 * psubq (%rax), %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm0, %mm2
+# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 psubq %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubsb %xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - 172.00 75.83 118.33 17.00 100.83 67.00 67.00
+# CHECK-NEXT: - 172.00 75.83 117.33 17.00 101.83 67.00 67.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -908,8 +908,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubb (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubd %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubd (%rax), %xmm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm0, %mm2
-# CHECK-NEXT: - - - 1.00 - - 0.50 0.50 psubq (%rax), %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %mm0, %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubsb %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
index 4a034cc..bdca772 100644
--- a/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
+++ b/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
@@ -83,12 +83,12 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 63
-# CHECK-NEXT: Total Cycles: 27
+# CHECK-NEXT: Total Cycles: 25
# CHECK-NEXT: Total uOps: 63
# CHECK: Dispatch Width: 4
-# CHECK-NEXT: uOps Per Cycle: 2.33
-# CHECK-NEXT: IPC: 2.33
+# CHECK-NEXT: uOps Per Cycle: 2.52
+# CHECK-NEXT: IPC: 2.52
# CHECK-NEXT: Block RThroughput: 15.8
# CHECK: Instruction Info:
@@ -121,7 +121,7 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 1 0 0.25 vpcmpgtw %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 1 3 1.00 psubb %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubd %mm2, %mm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm2, %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubw %mm2, %mm2
# CHECK-NEXT: 1 0 0.25 psubb %xmm2, %xmm2
# CHECK-NEXT: 1 0 0.25 psubd %xmm2, %xmm2
@@ -250,71 +250,71 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 01234
-# CHECK: [0,0] DR . . . . .. subl %eax, %eax
-# CHECK-NEXT: [0,1] DR . . . . .. subq %rax, %rax
-# CHECK-NEXT: [0,2] DR . . . . .. xorl %eax, %eax
-# CHECK-NEXT: [0,3] DR . . . . .. xorq %rax, %rax
-# CHECK-NEXT: [0,4] .DeeeER . . . .. pcmpgtb %mm2, %mm2
-# CHECK-NEXT: [0,5] .D===eeeER. . . .. pcmpgtd %mm2, %mm2
-# CHECK-NEXT: [0,6] .D======eeeER . . .. pcmpgtw %mm2, %mm2
-# CHECK-NEXT: [0,7] .D----------R . . .. pcmpgtb %xmm2, %xmm2
-# CHECK-NEXT: [0,8] . D---------R . . .. pcmpgtd %xmm2, %xmm2
-# CHECK-NEXT: [0,9] . D---------R . . .. pcmpgtq %xmm2, %xmm2
-# CHECK-NEXT: [0,10] . D---------R . . .. pcmpgtw %xmm2, %xmm2
-# CHECK-NEXT: [0,11] . D---------R . . .. vpcmpgtb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,12] . D--------R . . .. vpcmpgtd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,13] . D--------R . . .. vpcmpgtq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,14] . D--------R . . .. vpcmpgtw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,15] . D--------R . . .. vpcmpgtb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,16] . D-------R . . .. vpcmpgtd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,17] . D-------R . . .. vpcmpgtq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,18] . D-------R . . .. vpcmpgtw %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,19] . D======eeeER . .. psubb %mm2, %mm2
-# CHECK-NEXT: [0,20] . D========eeeER . .. psubd %mm2, %mm2
-# CHECK-NEXT: [0,21] . D===========eeeER .. psubq %mm2, %mm2
-# CHECK-NEXT: [0,22] . D==============eeeER.. psubw %mm2, %mm2
-# CHECK-NEXT: [0,23] . D------------------R.. psubb %xmm2, %xmm2
-# CHECK-NEXT: [0,24] . .D-----------------R.. psubd %xmm2, %xmm2
-# CHECK-NEXT: [0,25] . .D-----------------R.. psubq %xmm2, %xmm2
-# CHECK-NEXT: [0,26] . .D-----------------R.. psubw %xmm2, %xmm2
-# CHECK-NEXT: [0,27] . .D-----------------R.. vpsubb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,28] . . D----------------R.. vpsubd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,29] . . D----------------R.. vpsubq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,30] . . D----------------R.. vpsubw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,31] . . D----------------R.. vpsubb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,32] . . D---------------R.. vpsubd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,33] . . D---------------R.. vpsubq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,34] . . D---------------R.. vpsubw %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,35] . . DeE-------------R.. andnps %xmm0, %xmm0
-# CHECK-NEXT: [0,36] . . DeE------------R.. andnpd %xmm1, %xmm1
-# CHECK-NEXT: [0,37] . . D=eE-----------R.. vandnps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: [0,38] . . D===eE---------R.. vandnpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: [0,39] . . D==eE----------R.. vandnps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: [0,40] . . D===eE--------R.. vandnpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,41] . . D============eER. pandn %mm2, %mm2
-# CHECK-NEXT: [0,42] . . D==eE----------R. pandn %xmm2, %xmm2
-# CHECK-NEXT: [0,43] . . DeE------------R. vpandn %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,44] . . .D===eE--------R. vandnps %xmm2, %xmm2, %xmm5
-# CHECK-NEXT: [0,45] . . .D====eE-------R. vandnpd %xmm1, %xmm1, %xmm5
-# CHECK-NEXT: [0,46] . . .DeE-----------R. vpandn %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,47] . . .D=====eE------R. vandnps %ymm2, %ymm2, %ymm5
-# CHECK-NEXT: [0,48] . . . D=====eE-----R. vandnpd %ymm1, %ymm1, %ymm5
-# CHECK-NEXT: [0,49] . . . D------------R. xorps %xmm0, %xmm0
-# CHECK-NEXT: [0,50] . . . D------------R. xorpd %xmm1, %xmm1
-# CHECK-NEXT: [0,51] . . . D------------R. vxorps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: [0,52] . . . D-----------R. vxorpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: [0,53] . . . D-----------R. vxorps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: [0,54] . . . D-----------R. vxorpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,55] . . . D==========eER pxor %mm2, %mm2
-# CHECK-NEXT: [0,56] . . . D-----------R pxor %xmm2, %xmm2
-# CHECK-NEXT: [0,57] . . . D-----------R vpxor %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,58] . . . D-----------R vxorps %xmm4, %xmm4, %xmm5
-# CHECK-NEXT: [0,59] . . . D-----------R vxorpd %xmm1, %xmm1, %xmm3
-# CHECK-NEXT: [0,60] . . . D----------R vxorps %ymm4, %ymm4, %ymm5
-# CHECK-NEXT: [0,61] . . . D----------R vxorpd %ymm1, %ymm1, %ymm3
-# CHECK-NEXT: [0,62] . . . D----------R vpxor %xmm3, %xmm3, %xmm5
+# CHECK: [0,0] DR . . . . . subl %eax, %eax
+# CHECK-NEXT: [0,1] DR . . . . . subq %rax, %rax
+# CHECK-NEXT: [0,2] DR . . . . . xorl %eax, %eax
+# CHECK-NEXT: [0,3] DR . . . . . xorq %rax, %rax
+# CHECK-NEXT: [0,4] .DeeeER . . . . pcmpgtb %mm2, %mm2
+# CHECK-NEXT: [0,5] .D===eeeER. . . . pcmpgtd %mm2, %mm2
+# CHECK-NEXT: [0,6] .D======eeeER . . . pcmpgtw %mm2, %mm2
+# CHECK-NEXT: [0,7] .D----------R . . . pcmpgtb %xmm2, %xmm2
+# CHECK-NEXT: [0,8] . D---------R . . . pcmpgtd %xmm2, %xmm2
+# CHECK-NEXT: [0,9] . D---------R . . . pcmpgtq %xmm2, %xmm2
+# CHECK-NEXT: [0,10] . D---------R . . . pcmpgtw %xmm2, %xmm2
+# CHECK-NEXT: [0,11] . D---------R . . . vpcmpgtb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,12] . D--------R . . . vpcmpgtd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,13] . D--------R . . . vpcmpgtq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,14] . D--------R . . . vpcmpgtw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,15] . D--------R . . . vpcmpgtb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,16] . D-------R . . . vpcmpgtd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,17] . D-------R . . . vpcmpgtq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,18] . D-------R . . . vpcmpgtw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,19] . D======eeeER . . psubb %mm2, %mm2
+# CHECK-NEXT: [0,20] . D========eeeER . . psubd %mm2, %mm2
+# CHECK-NEXT: [0,21] . D===========eER. . psubq %mm2, %mm2
+# CHECK-NEXT: [0,22] . D============eeeER . psubw %mm2, %mm2
+# CHECK-NEXT: [0,23] . D----------------R . psubb %xmm2, %xmm2
+# CHECK-NEXT: [0,24] . .D---------------R . psubd %xmm2, %xmm2
+# CHECK-NEXT: [0,25] . .D---------------R . psubq %xmm2, %xmm2
+# CHECK-NEXT: [0,26] . .D---------------R . psubw %xmm2, %xmm2
+# CHECK-NEXT: [0,27] . .D---------------R . vpsubb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,28] . . D--------------R . vpsubd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,29] . . D--------------R . vpsubq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,30] . . D--------------R . vpsubw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,31] . . D--------------R . vpsubb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,32] . . D-------------R . vpsubd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,33] . . D-------------R . vpsubq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,34] . . D-------------R . vpsubw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,35] . . DeE-----------R . andnps %xmm0, %xmm0
+# CHECK-NEXT: [0,36] . . DeE----------R . andnpd %xmm1, %xmm1
+# CHECK-NEXT: [0,37] . . D=eE---------R . vandnps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,38] . . D===eE-------R . vandnpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,39] . . D==eE--------R . vandnps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: [0,40] . . D===eE------R . vandnpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,41] . . D==========eER. pandn %mm2, %mm2
+# CHECK-NEXT: [0,42] . . D==eE--------R. pandn %xmm2, %xmm2
+# CHECK-NEXT: [0,43] . . DeE----------R. vpandn %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,44] . . .D===eE------R. vandnps %xmm2, %xmm2, %xmm5
+# CHECK-NEXT: [0,45] . . .D====eE-----R. vandnpd %xmm1, %xmm1, %xmm5
+# CHECK-NEXT: [0,46] . . .DeE---------R. vpandn %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,47] . . .D=====eE----R. vandnps %ymm2, %ymm2, %ymm5
+# CHECK-NEXT: [0,48] . . . D=====eE---R. vandnpd %ymm1, %ymm1, %ymm5
+# CHECK-NEXT: [0,49] . . . D----------R. xorps %xmm0, %xmm0
+# CHECK-NEXT: [0,50] . . . D----------R. xorpd %xmm1, %xmm1
+# CHECK-NEXT: [0,51] . . . D----------R. vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,52] . . . D---------R. vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,53] . . . D---------R. vxorps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: [0,54] . . . D---------R. vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,55] . . . D========eER pxor %mm2, %mm2
+# CHECK-NEXT: [0,56] . . . D---------R pxor %xmm2, %xmm2
+# CHECK-NEXT: [0,57] . . . D---------R vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,58] . . . D---------R vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [0,59] . . . D---------R vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [0,60] . . . D--------R vxorps %ymm4, %ymm4, %ymm5
+# CHECK-NEXT: [0,61] . . . D--------R vxorpd %ymm1, %ymm1, %ymm3
+# CHECK-NEXT: [0,62] . . . D--------R vpxor %xmm3, %xmm3, %xmm5
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -345,45 +345,45 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 19. 1 7.0 0.0 0.0 psubb %mm2, %mm2
# CHECK-NEXT: 20. 1 9.0 0.0 0.0 psubd %mm2, %mm2
# CHECK-NEXT: 21. 1 12.0 0.0 0.0 psubq %mm2, %mm2
-# CHECK-NEXT: 22. 1 15.0 0.0 0.0 psubw %mm2, %mm2
-# CHECK-NEXT: 23. 1 0.0 0.0 18.0 psubb %xmm2, %xmm2
-# CHECK-NEXT: 24. 1 0.0 0.0 17.0 psubd %xmm2, %xmm2
-# CHECK-NEXT: 25. 1 0.0 0.0 17.0 psubq %xmm2, %xmm2
-# CHECK-NEXT: 26. 1 0.0 0.0 17.0 psubw %xmm2, %xmm2
-# CHECK-NEXT: 27. 1 0.0 0.0 17.0 vpsubb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 28. 1 0.0 0.0 16.0 vpsubd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 29. 1 0.0 0.0 16.0 vpsubq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 30. 1 0.0 0.0 16.0 vpsubw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 31. 1 0.0 0.0 16.0 vpsubb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 32. 1 0.0 0.0 15.0 vpsubd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 33. 1 0.0 0.0 15.0 vpsubq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 34. 1 0.0 0.0 15.0 vpsubw %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 35. 1 1.0 1.0 13.0 andnps %xmm0, %xmm0
-# CHECK-NEXT: 36. 1 1.0 1.0 12.0 andnpd %xmm1, %xmm1
-# CHECK-NEXT: 37. 1 2.0 2.0 11.0 vandnps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: 38. 1 4.0 2.0 9.0 vandnpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: 39. 1 3.0 0.0 10.0 vandnps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: 40. 1 4.0 0.0 8.0 vandnpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 41. 1 13.0 0.0 0.0 pandn %mm2, %mm2
-# CHECK-NEXT: 42. 1 3.0 0.0 10.0 pandn %xmm2, %xmm2
-# CHECK-NEXT: 43. 1 1.0 1.0 12.0 vpandn %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 44. 1 4.0 1.0 8.0 vandnps %xmm2, %xmm2, %xmm5
-# CHECK-NEXT: 45. 1 5.0 1.0 7.0 vandnpd %xmm1, %xmm1, %xmm5
-# CHECK-NEXT: 46. 1 1.0 0.0 11.0 vpandn %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 47. 1 6.0 3.0 6.0 vandnps %ymm2, %ymm2, %ymm5
-# CHECK-NEXT: 48. 1 6.0 3.0 5.0 vandnpd %ymm1, %ymm1, %ymm5
-# CHECK-NEXT: 49. 1 0.0 0.0 12.0 xorps %xmm0, %xmm0
-# CHECK-NEXT: 50. 1 0.0 0.0 12.0 xorpd %xmm1, %xmm1
-# CHECK-NEXT: 51. 1 0.0 0.0 12.0 vxorps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: 52. 1 0.0 0.0 11.0 vxorpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: 53. 1 0.0 0.0 11.0 vxorps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: 54. 1 0.0 0.0 11.0 vxorpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 55. 1 11.0 0.0 0.0 pxor %mm2, %mm2
-# CHECK-NEXT: 56. 1 0.0 0.0 11.0 pxor %xmm2, %xmm2
-# CHECK-NEXT: 57. 1 0.0 0.0 11.0 vpxor %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 58. 1 0.0 0.0 11.0 vxorps %xmm4, %xmm4, %xmm5
-# CHECK-NEXT: 59. 1 0.0 0.0 11.0 vxorpd %xmm1, %xmm1, %xmm3
-# CHECK-NEXT: 60. 1 0.0 0.0 10.0 vxorps %ymm4, %ymm4, %ymm5
-# CHECK-NEXT: 61. 1 0.0 0.0 10.0 vxorpd %ymm1, %ymm1, %ymm3
-# CHECK-NEXT: 62. 1 0.0 0.0 10.0 vpxor %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 1 1.9 0.3 8.9 <total>
+# CHECK-NEXT: 22. 1 13.0 0.0 0.0 psubw %mm2, %mm2
+# CHECK-NEXT: 23. 1 0.0 0.0 16.0 psubb %xmm2, %xmm2
+# CHECK-NEXT: 24. 1 0.0 0.0 15.0 psubd %xmm2, %xmm2
+# CHECK-NEXT: 25. 1 0.0 0.0 15.0 psubq %xmm2, %xmm2
+# CHECK-NEXT: 26. 1 0.0 0.0 15.0 psubw %xmm2, %xmm2
+# CHECK-NEXT: 27. 1 0.0 0.0 15.0 vpsubb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 28. 1 0.0 0.0 14.0 vpsubd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 29. 1 0.0 0.0 14.0 vpsubq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 30. 1 0.0 0.0 14.0 vpsubw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 31. 1 0.0 0.0 14.0 vpsubb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 32. 1 0.0 0.0 13.0 vpsubd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 33. 1 0.0 0.0 13.0 vpsubq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 34. 1 0.0 0.0 13.0 vpsubw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 35. 1 1.0 1.0 11.0 andnps %xmm0, %xmm0
+# CHECK-NEXT: 36. 1 1.0 1.0 10.0 andnpd %xmm1, %xmm1
+# CHECK-NEXT: 37. 1 2.0 2.0 9.0 vandnps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 38. 1 4.0 2.0 7.0 vandnpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 39. 1 3.0 0.0 8.0 vandnps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: 40. 1 4.0 0.0 6.0 vandnpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 41. 1 11.0 0.0 0.0 pandn %mm2, %mm2
+# CHECK-NEXT: 42. 1 3.0 0.0 8.0 pandn %xmm2, %xmm2
+# CHECK-NEXT: 43. 1 1.0 1.0 10.0 vpandn %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 44. 1 4.0 1.0 6.0 vandnps %xmm2, %xmm2, %xmm5
+# CHECK-NEXT: 45. 1 5.0 1.0 5.0 vandnpd %xmm1, %xmm1, %xmm5
+# CHECK-NEXT: 46. 1 1.0 0.0 9.0 vpandn %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 47. 1 6.0 3.0 4.0 vandnps %ymm2, %ymm2, %ymm5
+# CHECK-NEXT: 48. 1 6.0 3.0 3.0 vandnpd %ymm1, %ymm1, %ymm5
+# CHECK-NEXT: 49. 1 0.0 0.0 10.0 xorps %xmm0, %xmm0
+# CHECK-NEXT: 50. 1 0.0 0.0 10.0 xorpd %xmm1, %xmm1
+# CHECK-NEXT: 51. 1 0.0 0.0 10.0 vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 52. 1 0.0 0.0 9.0 vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 53. 1 0.0 0.0 9.0 vxorps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: 54. 1 0.0 0.0 9.0 vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 55. 1 9.0 0.0 0.0 pxor %mm2, %mm2
+# CHECK-NEXT: 56. 1 0.0 0.0 9.0 pxor %xmm2, %xmm2
+# CHECK-NEXT: 57. 1 0.0 0.0 9.0 vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 58. 1 0.0 0.0 9.0 vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: 59. 1 0.0 0.0 9.0 vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: 60. 1 0.0 0.0 8.0 vxorps %ymm4, %ymm4, %ymm5
+# CHECK-NEXT: 61. 1 0.0 0.0 8.0 vxorpd %ymm1, %ymm1, %ymm3
+# CHECK-NEXT: 62. 1 0.0 0.0 8.0 vpxor %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 1 1.8 0.3 7.7 <total>
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
index ef5a9e3..2711bc4 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
@@ -1461,8 +1461,8 @@ vzeroupper
# CHECK-NEXT: 4 9 2.00 * vphaddsw (%rax), %xmm1, %xmm2
# CHECK-NEXT: 3 3 2.00 vphaddw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 9 2.00 * vphaddw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 4 0.50 vphminposuw %xmm0, %xmm2
-# CHECK-NEXT: 2 10 0.50 * vphminposuw (%rax), %xmm2
+# CHECK-NEXT: 1 4 1.00 vphminposuw %xmm0, %xmm2
+# CHECK-NEXT: 2 10 1.00 * vphminposuw (%rax), %xmm2
# CHECK-NEXT: 3 3 2.00 vphsubd %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 9 2.00 * vphsubd (%rax), %xmm1, %xmm2
# CHECK-NEXT: 3 3 2.00 vphsubsw %xmm0, %xmm1, %xmm2
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 126.00 339.58 199.58 173.83 173.83 38.00 326.58 6.25 11.33
+# CHECK-NEXT: - 126.00 340.58 198.58 173.83 173.83 38.00 326.58 6.25 11.33
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -2171,8 +2171,8 @@ vzeroupper
# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - vphaddsw (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vphaddw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - vphaddw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - 0.50 0.50 - - - - - - vphminposuw %xmm0, %xmm2
-# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - vphminposuw (%rax), %xmm2
+# CHECK-NEXT: - - 1.00 - - - - - - - vphminposuw %xmm0, %xmm2
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vphminposuw (%rax), %xmm2
# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vphsubd %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - vphsubd (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - vphsubsw %xmm0, %xmm1, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
index 1d8d67f..0eec7e4 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
@@ -189,8 +189,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 2 3 1.00 pextrq $1, %xmm0, %rcx
# CHECK-NEXT: 3 2 1.00 * pextrq $1, %xmm0, (%rax)
# CHECK-NEXT: 3 2 1.00 * pextrw $1, %xmm0, (%rax)
-# CHECK-NEXT: 1 4 0.50 phminposuw %xmm0, %xmm2
-# CHECK-NEXT: 2 10 0.50 * phminposuw (%rax), %xmm2
+# CHECK-NEXT: 1 4 1.00 phminposuw %xmm0, %xmm2
+# CHECK-NEXT: 2 10 1.00 * phminposuw (%rax), %xmm2
# CHECK-NEXT: 2 2 2.00 pinsrb $1, %eax, %xmm1
# CHECK-NEXT: 2 6 1.00 * pinsrb $1, (%rax), %xmm1
# CHECK-NEXT: 2 2 2.00 pinsrd $1, %eax, %xmm1
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 37.83 31.33 23.67 23.67 5.00 63.33 0.50 1.67
+# CHECK-NEXT: - - 38.83 30.33 23.67 23.67 5.00 63.33 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -304,8 +304,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - pextrq $1, %xmm0, %rcx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 pextrq $1, %xmm0, (%rax)
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 pextrw $1, %xmm0, (%rax)
-# CHECK-NEXT: - - 0.50 0.50 - - - - - - phminposuw %xmm0, %xmm2
-# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - phminposuw (%rax), %xmm2
+# CHECK-NEXT: - - 1.00 - - - - - - - phminposuw %xmm0, %xmm2
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - phminposuw (%rax), %xmm2
# CHECK-NEXT: - - - - - - - 2.00 - - pinsrb $1, %eax, %xmm1
# CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - pinsrb $1, (%rax), %xmm1
# CHECK-NEXT: - - - - - - - 2.00 - - pinsrd $1, %eax, %xmm1
diff --git a/llvm/tools/llvm-driver/llvm-driver.cpp b/llvm/tools/llvm-driver/llvm-driver.cpp
index 53a8b93..14ce162 100644
--- a/llvm/tools/llvm-driver/llvm-driver.cpp
+++ b/llvm/tools/llvm-driver/llvm-driver.cpp
@@ -33,7 +33,7 @@ static void printHelpMessage() {
<< subcommands
<< "\n Type \"llvm <subcommand> --help\" to get more help on a "
"specific subcommand\n\n"
- << "OPTIONS:\n\n --help - Display this message";
+ << "OPTIONS:\n\n --help - Display this message\n";
}
static int findTool(int Argc, char **Argv, const char *Argv0) {
diff --git a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
index 629996d..3d36d80 100644
--- a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
+++ b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
@@ -232,7 +232,7 @@ static std::unique_ptr<MachineFunction> cloneMF(MachineFunction *SrcMF,
MachineModuleInfo &DestMMI) {
auto DstMF = std::make_unique<MachineFunction>(
SrcMF->getFunction(), SrcMF->getTarget(), SrcMF->getSubtarget(),
- SrcMF->getFunctionNumber(), DestMMI);
+ SrcMF->getContext(), SrcMF->getFunctionNumber());
DenseMap<MachineBasicBlock *, MachineBasicBlock *> Src2DstMBB;
auto *SrcMRI = &SrcMF->getRegInfo();
diff --git a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
index 877793a..79e27c7 100644
--- a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -61,8 +61,8 @@ protected:
MachineModuleInfo MMI(TM.get());
- MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F), 0,
- MMI);
+ MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
+ MMI.getContext(), 0);
DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
if (!DAG)
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
index 28cfb30..d5365d9 100644
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -94,7 +94,7 @@ public:
const TargetSubtargetInfo &STI = *Machine->getSubtargetImpl(*F);
MF = std::make_unique<MachineFunction>(*F, (LLVMTargetMachine &)*Machine,
- STI, FunctionNum, *MMI);
+ STI, MMI->getContext(), FunctionNum);
// Create metadata: CU, subprogram, some blocks and an inline function
// scope.
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index 1997e80..d464a16 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -135,6 +135,7 @@ std::unique_ptr<MachineFunction> createMachineFunction(LLVMContext &Ctx,
MachineModuleInfo MMI(TM);
const TargetSubtargetInfo &STI = *TM->getSubtargetImpl(*F);
- return std::make_unique<MachineFunction>(*F, *TM, STI, FunctionNum, MMI);
+ return std::make_unique<MachineFunction>(*F, *TM, STI, MMI.getContext(),
+ FunctionNum);
}
diff --git a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
index ef30bd4..c89e5a4 100644
--- a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
@@ -72,7 +72,7 @@ protected:
MachineModuleInfo MMI(TM.get());
MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
- 0, MMI);
+ MMI.getContext(), 0);
DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
if (!DAG)
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index e318f46..549931b 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -71,7 +71,7 @@ protected:
MachineModuleInfo MMI(TM.get());
MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
- 0, MMI);
+ MMI.getContext(), 0);
DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
if (!DAG)
@@ -131,6 +131,12 @@ TEST_F(SelectionDAGPatternMatchTest, matchTernaryOp) {
SDValue ICMP_EQ01 = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETEQ);
SDValue ICMP_EQ10 = DAG->getSetCC(DL, MVT::i1, Op1, Op0, ISD::SETEQ);
+ auto Int1VT = EVT::getIntegerVT(Context, 1);
+ SDValue Cond = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 3, Int1VT);
+ SDValue T = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 4, Int1VT);
+ SDValue F = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 5, Int1VT);
+ SDValue Select = DAG->getSelect(DL, MVT::i1, Cond, T, F);
+
using namespace SDPatternMatch;
ISD::CondCode CC;
EXPECT_TRUE(sd_match(ICMP_UGT, m_SetCC(m_Value(), m_Value(),
@@ -153,6 +159,13 @@ TEST_F(SelectionDAGPatternMatchTest, matchTernaryOp) {
m_SpecificCondCode(ISD::SETEQ))));
EXPECT_TRUE(sd_match(ICMP_EQ10, m_c_SetCC(m_Specific(Op0), m_Specific(Op1),
m_SpecificCondCode(ISD::SETEQ))));
+
+ EXPECT_TRUE(sd_match(
+ Select, m_Select(m_Specific(Cond), m_Specific(T), m_Specific(F))));
+ EXPECT_FALSE(sd_match(
+ Select, m_Select(m_Specific(Cond), m_Specific(F), m_Specific(T))));
+ EXPECT_FALSE(sd_match(ICMP_EQ01, m_Select(m_Specific(Op0), m_Specific(Op1),
+ m_SpecificCondCode(ISD::SETEQ))));
}
TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
diff --git a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
index ece288a..5ac4eda 100644
--- a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
+++ b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
@@ -29,7 +29,8 @@ TEST(AMDGPU, ExecMayBeModifiedBeforeAnyUse) {
auto *F = Function::Create(Type, GlobalValue::ExternalLinkage, "Test", &Mod);
MachineModuleInfo MMI(TM.get());
- auto MF = std::make_unique<MachineFunction>(*F, *TM, ST, 42, MMI);
+ auto MF =
+ std::make_unique<MachineFunction>(*F, *TM, ST, MMI.getContext(), 42);
auto *BB = MF->CreateMachineBasicBlock();
MF->push_back(BB);
diff --git a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
index 7d4f383..86aa475 100644
--- a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
+++ b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
@@ -62,7 +62,7 @@ protected:
TM->getTargetCPU(),
TM->getTargetFeatureString(), *TM);
- MF = std::make_unique<MachineFunction>(*F, *TM, *ST, 1, *MMI);
+ MF = std::make_unique<MachineFunction>(*F, *TM, *ST, MMI->getContext(), 1);
}
};
diff --git a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
index 62cfda4..fe71161 100644
--- a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
+++ b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
@@ -63,7 +63,7 @@ protected:
TM->getTargetFeatureString(),
TM->getTargetTriple().isArch64Bit() ? "lp64" : "ilp32", 0, 0, *TM);
- MF = std::make_unique<MachineFunction>(*F, *TM, *ST, 42, *MMI);
+ MF = std::make_unique<MachineFunction>(*F, *TM, *ST, MMI->getContext(), 42);
}
};
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index d9d6789..9cf9060 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1140,7 +1140,7 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
&VecOp, false);
VPValue EVL;
- VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp);
+ VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp);
EXPECT_FALSE(EVLRecipe.mayHaveSideEffects());
EXPECT_FALSE(EVLRecipe.mayReadFromMemory());
EXPECT_FALSE(EVLRecipe.mayWriteToMemory());
@@ -1495,7 +1495,7 @@ TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) {
VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
&VecOp, false);
VPValue EVL;
- VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp);
+ VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp);
EXPECT_TRUE(isa<VPUser>(&EVLRecipe));
VPRecipeBase *BaseR = &EVLRecipe;
EXPECT_TRUE(isa<VPUser>(BaseR));
diff --git a/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
index c6c6ef9..7825396 100644
--- a/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
@@ -15,7 +15,6 @@ static_library("Interpreter") {
deps = [
":InterpreterProperties",
":InterpreterPropertiesEnum",
- "Interfaces",
"//lldb/source/Commands",
"//lldb/source/Core",
"//lldb/source/DataFormatters",
diff --git a/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn
deleted file mode 100644
index 2e70c54..0000000
--- a/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn
+++ /dev/null
@@ -1,9 +0,0 @@
-static_library("Interfaces") {
- output_name = "lldbInterpreterInterfaces"
- configs += [ "//llvm/utils/gn/build:lldb_code" ]
- deps = [
- "//lldb/source/Utility",
- "//llvm/lib/Support",
- ]
- sources = [ "ScriptedInterfaceUsages.cpp" ]
-}
diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
index 945fb0e..a2b0603 100644
--- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
@@ -266,6 +266,7 @@ group("test") {
"//llvm/tools/llvm-cat",
"//llvm/tools/llvm-cfi-verify",
"//llvm/tools/llvm-cov",
+ "//llvm/tools/llvm-ctxprof-util",
"//llvm/tools/llvm-cvtres",
"//llvm/tools/llvm-cxxdump",
"//llvm/tools/llvm-cxxfilt",
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn
new file mode 100644
index 0000000..fd921f6
--- /dev/null
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn
@@ -0,0 +1,11 @@
+import("//llvm/utils/gn/build/driver_executable.gni")
+
+driver_executable("llvm-ctxprof-util") {
+ deps = [
+ "//llvm/lib/IR",
+ "//llvm/lib/Object",
+ "//llvm/lib/ProfileData",
+ "//llvm/lib/Support",
+ ]
+ sources = [ "llvm-ctxprof-util.cpp" ]
+}
diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md
index 01fadef..c252d9c 100644
--- a/mlir/docs/DefiningDialects/Operations.md
+++ b/mlir/docs/DefiningDialects/Operations.md
@@ -101,6 +101,9 @@ their semantics via a special [TableGen backend][TableGenBackend]:
* The `AttrConstraint` class hierarchy: They are used to specify the
constraints over attributes. A notable subclass hierarchy is `Attr`, which
stands for constraints for attributes whose values are of common types.
+* The `Property` class hierarchy: They are used to specify non-attribute-backed
+ properties that are inherent to operations. This will be expanded to a
+ `PropertyConstraint` class or something similar in the future.
An operation is defined by specializing the `Op` class with concrete contents
for all the fields it requires. For example, `tf.AvgPool` is defined as
@@ -172,9 +175,9 @@ understanding the operation.
### Operation arguments
-There are two kinds of arguments: operands and attributes. Operands are runtime
-values produced by other ops; while attributes are compile-time known constant
-values, including two categories:
+There are three kinds of arguments: operands, attributes, and properties.
+Operands are runtime values produced by other ops, while attributes and properties
+are compile-time known constant values, including two categories:
1. Natural attributes: these attributes affect the behavior of the operations
(e.g., padding for convolution);
@@ -187,8 +190,11 @@ values, including two categories:
even though they are not materialized, it should be possible to store as an
attribute.
-Both operands and attributes are specified inside the `dag`-typed `arguments`,
-led by `ins`:
+Properties are similar to attributes, except that they are not stored within
+the MLIR context but are stored inline with the operation.
+
+Operands, attributes, and properties are specified inside the `dag`-typed
+`arguments`, led by `ins`:
```tablegen
let arguments = (ins
@@ -196,13 +202,15 @@ let arguments = (ins
...
<attr-constraint>:$<attr-name>,
...
+ <property-constraint>:$<property-name>,
);
```
Here `<type-constraint>` is a TableGen `def` from the `TypeConstraint` class
hierarchy. Similarly, `<attr-constraint>` is a TableGen `def` from the
-`AttrConstraint` class hierarchy. See [Constraints](#constraints) for more
-information.
+`AttrConstraint` class hierarchy and `<property-constraint>` is a subclass
+of `Property` (though a `PropertyConstraint` hierarchy is planned).
+See [Constraints](#constraints) for more information.
There are no requirements on the relative order of operands and attributes; they
can mix freely. The relative order of operands themselves matters. From each
@@ -324,6 +332,18 @@ Right now, the following primitive constraints are supported:
TODO: Design and implement more primitive constraints
+#### Optional and default-valued properties
+
+To declare a property with a default value, use `DefaultValuedProperty<..., "...">`.
+If the property's storage data type is different from its interface type,
+for example, in the case of array properties (which are stored as `SmallVector`s
+but use `ArrayRef` as an interface type), add the storage-type equivalent
+of the default value as the third argument.
+
+To declare an optional property, use `OptionalProperty<...>`.
+This wraps the underlying property in an `std::optional` and gives it a
+default value of `std::nullopt`.
+
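+As a sketch (the property names `$alignment` and `$label` here are invented
+for illustration), an op might declare one of each:
+
+```tablegen
+let arguments = (ins
+  DefaultValuedProperty<I64Property, "0">:$alignment,
+  OptionalProperty<StringProperty>:$label
+);
+```
+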
#### Combining constraints
`AllAttrOf` is provided to allow combination of multiple constraints which
@@ -429,6 +449,8 @@ def MyOp : ... {
I32Attr:$i32_attr,
F32Attr:$f32_attr,
...
+ I32Property:$i32_prop,
+ ...
);
let results = (outs
@@ -453,7 +475,8 @@ static void build(OpBuilder &odsBuilder, OperationState &odsState,
static void build(OpBuilder &odsBuilder, OperationState &odsState,
Type i32_result, Type f32_result, ...,
Value i32_operand, Value f32_operand, ...,
- IntegerAttr i32_attr, FloatAttr f32_attr, ...);
+ IntegerAttr i32_attr, FloatAttr f32_attr, ...,
+ int32_t i32_prop);
// Each result-type/operand/attribute has a separate parameter. The parameters
// for attributes are raw values unwrapped with mlir::Attribute instances.
@@ -462,13 +485,15 @@ static void build(OpBuilder &odsBuilder, OperationState &odsState,
static void build(OpBuilder &odsBuilder, OperationState &odsState,
Type i32_result, Type f32_result, ...,
Value i32_operand, Value f32_operand, ...,
- APInt i32_attr, StringRef f32_attr, ...);
+ APInt i32_attr, StringRef f32_attr, ...,
+ int32_t i32_prop, ...);
// Each operand/attribute has a separate parameter but result type is aggregate.
static void build(OpBuilder &odsBuilder, OperationState &odsState,
TypeRange resultTypes,
Value i32_operand, Value f32_operand, ...,
- IntegerAttr i32_attr, FloatAttr f32_attr, ...);
+ IntegerAttr i32_attr, FloatAttr f32_attr, ...,
+ int32_t i32_prop, ...);
// All operands/attributes have aggregate parameters.
// Generated if return type can be inferred.
@@ -921,8 +946,10 @@ optional-group: `(` then-elements `)` (`:` `(` else-elements `)`)? `?`
The elements of an optional group have the following requirements:
*   The first element of `then-elements` must be an attribute, literal,
- operand, or region.
+ operand, property, or region.
- This is because the first element must be optionally parsable.
+ - If a property is used, it must have an `optionalParser` defined and have a
+ default value.
* Exactly one argument variable or type directive within either
`then-elements` or `else-elements` must be marked as the anchor of the
group.
@@ -984,6 +1011,8 @@ foo.op is_read_only
foo.op
```
+The same logic applies to a `UnitProperty`.
+
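+For example, a format along these lines (the op and name are assumed for
+illustration, with `$is_read_only` declared as a `UnitProperty`) would elide
+the keyword when the property is false:
+
+```tablegen
+let assemblyFormat = "(`is_read_only` $is_read_only^)? attr-dict";
+```
+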
##### Optional "else" Group
Optional groups also have support for an "else" group of elements. These are
@@ -1026,6 +1055,8 @@ to:
1. All operand and result types must appear within the format using the various
`type` directives, either individually or with the `operands` or `results`
directives.
+1. Unless all non-attribute properties appear in the format, the `prop-dict`
+ directive must be present.
1. The `attr-dict` directive must always be present.
1. Must not contain overlapping information; e.g. multiple instances of
'attr-dict', types, operands, etc.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index b14bd83..260d421 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -59,22 +59,9 @@ class LLVM_IntArithmeticOpWithOverflowFlag<string mnemonic, string instName,
list<Trait> traits = []> :
LLVM_ArithmeticOpBase<AnySignlessInteger, mnemonic, instName,
!listconcat([DeclareOpInterfaceMethods<IntegerOverflowFlagsInterface>], traits)> {
- dag iofArg = (ins EnumProperty<"IntegerOverflowFlags">:$overflowFlags);
+ dag iofArg = (ins EnumProperty<"IntegerOverflowFlags", "", "IntegerOverflowFlags::none">:$overflowFlags);
let arguments = !con(commonArgs, iofArg);
- let builders = [
- OpBuilder<(ins "Type":$type, "Value":$lhs, "Value":$rhs,
- "IntegerOverflowFlags":$overflowFlags), [{
- $_state.getOrAddProperties<Properties>().overflowFlags = overflowFlags;
- build($_builder, $_state, type, lhs, rhs);
- }]>,
- OpBuilder<(ins "Value":$lhs, "Value":$rhs,
- "IntegerOverflowFlags":$overflowFlags), [{
- $_state.getOrAddProperties<Properties>().overflowFlags = overflowFlags;
- build($_builder, $_state, lhs, rhs);
- }]>
- ];
-
string mlirBuilder = [{
auto op = $_builder.create<$_qualCppClassName>($_location, $lhs, $rhs);
moduleImport.setIntegerOverflowFlags(inst, op);
diff --git a/mlir/include/mlir/IR/ODSSupport.h b/mlir/include/mlir/IR/ODSSupport.h
index 70e3f98..25d6f3d 100644
--- a/mlir/include/mlir/IR/ODSSupport.h
+++ b/mlir/include/mlir/IR/ODSSupport.h
@@ -33,6 +33,37 @@ convertFromAttribute(int64_t &storage, Attribute attr,
/// Convert the provided int64_t to an IntegerAttr attribute.
Attribute convertToAttribute(MLIRContext *ctx, int64_t storage);
+/// Convert an IntegerAttr attribute to an int32_t, or return an error if the
+/// attribute isn't an IntegerAttr. If the optional diagnostic is provided, an
+/// error message is also emitted.
+LogicalResult
+convertFromAttribute(int32_t &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the provided int32_t to an IntegerAttr attribute.
+Attribute convertToAttribute(MLIRContext *ctx, int32_t storage);
+
+/// Extract the string from `attr` into `storage`. If `attr` is not a
+/// `StringAttr`, return failure and emit an error into the diagnostic from
+/// `emitError`.
+LogicalResult
+convertFromAttribute(std::string &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the given string into a StringAttr. Note that this takes a reference
+/// to the storage of a string property, which is a std::string.
+Attribute convertToAttribute(MLIRContext *ctx, const std::string &storage);
+
+/// Extract the boolean from `attr` into `storage`. If `attr` is not a
+/// `BoolAttr`, return failure and emit an error into the diagnostic from
+/// `emitError`.
+LogicalResult
+convertFromAttribute(bool &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the given bool into a BoolAttr.
+Attribute convertToAttribute(MLIRContext *ctx, bool storage);
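+
+// A usage sketch (illustrative only; `attr`, `ctx`, and `diag` are assumed to
+// be in scope, with `diag` a `function_ref<InFlightDiagnostic()>`):
+//
+//   bool flag = false;
+//   if (failed(convertFromAttribute(flag, attr, diag)))
+//     return failure();
+//   Attribute roundTripped = convertToAttribute(ctx, flag);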
+
/// Convert a DenseI64ArrayAttr to the provided storage. It is expected that the
/// storage has the same size as the array. An error is returned if the
/// attribute isn't a DenseI64ArrayAttr or it does not have the same size. If
@@ -49,9 +80,24 @@ LogicalResult
convertFromAttribute(MutableArrayRef<int32_t> storage, Attribute attr,
function_ref<InFlightDiagnostic()> emitError);
+/// Convert a DenseI64ArrayAttr to the provided storage, which will be
+/// cleared before writing. An error is returned and emitted to the optional
+/// `emitError` function if the attribute isn't a DenseI64ArrayAttr.
+LogicalResult
+convertFromAttribute(SmallVectorImpl<int64_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert a DenseI32ArrayAttr to the provided storage, which will be
+/// cleared before writing. An error is returned and emitted to the optional
+/// `emitError` function if the attribute isn't a DenseI32ArrayAttr.
+LogicalResult
+convertFromAttribute(SmallVectorImpl<int32_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
/// Convert the provided ArrayRef<int64_t> to a DenseI64ArrayAttr attribute.
Attribute convertToAttribute(MLIRContext *ctx, ArrayRef<int64_t> storage);
} // namespace mlir
-#endif // MLIR_IR_ODSSUPPORT_H \ No newline at end of file
+#endif // MLIR_IR_ODSSUPPORT_H
diff --git a/mlir/include/mlir/IR/Properties.td b/mlir/include/mlir/IR/Properties.td
index 0babdbb..0becf7d 100644
--- a/mlir/include/mlir/IR/Properties.td
+++ b/mlir/include/mlir/IR/Properties.td
@@ -29,7 +29,6 @@ class Property<string storageTypeParam = "", string desc = ""> {
//
// Format:
// - `$_storage` will contain the property in the storage type.
- // - `$_ctxt` will contain an `MLIRContext *`.
code convertFromStorage = "$_storage";
// The call expression to build a property storage from the interface type.
@@ -40,24 +39,26 @@ class Property<string storageTypeParam = "", string desc = ""> {
code assignToStorage = "$_storage = $_value";
// The call expression to convert from the storage type to an attribute.
+ // The resulting attribute must be non-null in non-error cases.
//
// Format:
// - `$_storage` is the storage type value.
// - `$_ctxt` is a `MLIRContext *`.
//
- // The expression must result in an Attribute.
+ // The expression must return an `Attribute` and will be used as a function body.
code convertToAttribute = [{
- convertToAttribute($_ctxt, $_storage)
+ return convertToAttribute($_ctxt, $_storage);
}];
// The call expression to convert from an Attribute to the storage type.
//
// Format:
- // - `$_storage` is the storage type value.
+ // - `$_storage` is a reference to a value of the storage type.
// - `$_attr` is the attribute.
// - `$_diag` is a callback to get a Diagnostic to emit error.
//
- // The expression must return a LogicalResult
+ // The expression must return a LogicalResult and will be used as a function body
+ // or in other similar contexts.
code convertFromAttribute = [{
return convertFromAttribute($_storage, $_attr, $_diag);
}];
@@ -68,18 +69,68 @@ class Property<string storageTypeParam = "", string desc = ""> {
// - `$_storage` is the variable to hash.
//
// The expression should define a llvm::hash_code.
- code hashProperty = [{
- llvm::hash_value($_storage);
+ // If unspecified, defaults to `llvm::hash_value($_storage)`.
+ // The default is not specified in tablegen because many combinators, like
+ // ArrayProperty, can fall back to more efficient implementations of
+ // `hashProperty` when their underlying elements have trivial hashing.
+ code hashProperty = "";
+
+ // The body of the parser for a value of this property.
+ // Format:
+ // - `$_parser` is the OpAsmParser.
+ // - `$_storage` is the location into which the value is to be placed if it is
+ // present.
+ // - `$_ctxt` is a `MLIRContext *`
+ //
+ // This defines the body of a function (typically a lambda) that returns a
+ // ParseResult. There is an implicit `return success()` at the end of the parser
+ // code.
+ //
+ // When this code executes, `$_storage` will be initialized to the property's
+ // default value (if any, accounting for the storage type override).
+ code parser = [{
+ auto value = ::mlir::FieldParser<}] # storageType # [{>::parse($_parser);
+ if (::mlir::failed(value))
+ return ::mlir::failure();
+ $_storage = std::move(*value);
}];
+ // The body of the parser for a value of this property as the anchor of an optional
+ // group. This should parse the property if possible and do nothing if a value of
+ // the relevant type is not next in the parse stream.
+ // You are not required to define this parser if it cannot be meaningfully
+ // implemented.
+ // This has the same context and substitutions as `parser` except that it is
+ // required to return an OptionalParseResult.
+ //
+ // If the optional parser doesn't parse anything, it should not set
+ // $_storage, since the parser doesn't know if the default value has been
+ // overwritten.
+ code optionalParser = "";
+
+ // The printer for a value of this property.
+ // Format:
+ // - `$_storage` is the storage data.
+ // - `$_printer` is the OpAsmPrinter instance.
+ // - `$_ctxt` is a `MLIRContext *`
+ //
+ // This may be called in an expression context, so variable declarations must
+ // be placed within a new scope.
+ //
+  // The printer for a property should always print a non-empty value; default-value
+  // printing elision happens outside the context of this printing expression.
+ code printer = "$_printer << $_storage";
+
// The call expression to emit the storage type to bytecode.
//
// Format:
// - `$_storage` is the storage type value.
// - `$_writer` is a `DialectBytecodeWriter`.
// - `$_ctxt` is a `MLIRContext *`.
+ //
+  // This will become the body of a function returning void.
code writeToMlirBytecode = [{
- writeToMlirBytecode($_writer, $_storage)
+ writeToMlirBytecode($_writer, $_storage);
}];
// The call expression to read the storage type from bytecode.
@@ -88,13 +139,31 @@ class Property<string storageTypeParam = "", string desc = ""> {
// - `$_storage` is the storage type value.
// - `$_reader` is a `DialectBytecodeReader`.
// - `$_ctxt` is a `MLIRContext *`.
+ //
+ // This will become the body of a function returning LogicalResult.
+ // There is an implicit `return success()` at the end of this function.
+ //
+ // When this code executes, `$_storage` will be initialized to the property's
+ // default value (if any, accounting for the storage type override).
code readFromMlirBytecode = [{
if (::mlir::failed(readFromMlirBytecode($_reader, $_storage)))
return ::mlir::failure();
}];
- // Default value for the property.
- string defaultValue = ?;
+  // Base definition for the property. Used (or will be used) for
+  // `OptionalProperty` and similar cases, analogously to `baseAttr`.
+ Property baseProperty = ?;
+
+ // Default value for the property within its storage. This should be an expression
+  // of type `interfaceType` and should be comparable with other values of that
+  // interface type with `==`. The empty string means there is no default value.
+ string defaultValue = "";
+
+  // If set, the default value that the storage of the property should be
+  // initialized to. This is only needed when the storage and interface types of
+  // the property are distinct (e.g., SmallVector for storage vs. ArrayRef for
+  // interfacing); otherwise this falls back to `defaultValue`.
+ string storageTypeValueOverride = "";
}
/// Implementation of the Property class's `readFromMlirBytecode` field using
@@ -133,12 +202,16 @@ defvar writeMlirBytecodeWithConvertToAttribute = [{
// Primitive property kinds
// Any kind of integer stored as properties.
-class IntProperty<string storageTypeParam = "", string desc = ""> :
+class IntProperty<string storageTypeParam, string desc = ""> :
Property<storageTypeParam, desc> {
- code writeToMlirBytecode = [{
+ let summary = !if(!empty(desc), storageTypeParam, desc);
+ let optionalParser = [{
+ return $_parser.parseOptionalInteger($_storage);
+ }];
+ let writeToMlirBytecode = [{
$_writer.writeVarInt($_storage);
}];
- code readFromMlirBytecode = [{
+ let readFromMlirBytecode = [{
uint64_t val;
if (failed($_reader.readVarInt(val)))
return ::mlir::failure();
@@ -146,24 +219,472 @@ class IntProperty<string storageTypeParam = "", string desc = ""> :
}];
}
-class ArrayProperty<string storageTypeParam = "", int n, string desc = ""> :
- Property<storageTypeParam # "[" # n # "]", desc> {
- let interfaceType = "::llvm::ArrayRef<" # storageTypeParam # ">";
- let convertFromStorage = "$_storage";
- let assignToStorage = "::llvm::copy($_value, $_storage)";
-}
+def I32Property : IntProperty<"int32_t">;
+def I64Property : IntProperty<"int64_t">;
-class EnumProperty<string storageTypeParam, string desc = ""> :
+class EnumProperty<string storageTypeParam, string desc = "", string default = ""> :
Property<storageTypeParam, desc> {
- code writeToMlirBytecode = [{
+ // TODO: take advantage of EnumAttrInfo and the like to make this share nice
+ // parsing code with EnumAttr.
+ let writeToMlirBytecode = [{
$_writer.writeVarInt(static_cast<uint64_t>($_storage));
}];
- code readFromMlirBytecode = [{
+ let readFromMlirBytecode = [{
uint64_t val;
if (failed($_reader.readVarInt(val)))
return ::mlir::failure();
$_storage = static_cast<}] # storageTypeParam # [{>(val);
}];
+ let defaultValue = default;
}
+def StringProperty : Property<"std::string", "string"> {
+ let interfaceType = "::llvm::StringRef";
+ let convertFromStorage = "::llvm::StringRef{$_storage}";
+ let assignToStorage = "$_storage = $_value.str()";
+ let optionalParser = [{
+ if (::mlir::failed($_parser.parseOptionalString(&$_storage)))
+ return std::nullopt;
+ }];
+ let printer = "$_printer.printString($_storage)";
+ let readFromMlirBytecode = [{
+ StringRef val;
+ if (::mlir::failed($_reader.readString(val)))
+ return ::mlir::failure();
+ $_storage = val.str();
+ }];
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedString($_storage);
+ }];
+}
+
+def BoolProperty : IntProperty<"bool", "boolean"> {
+ let printer = [{ $_printer << ($_storage ? "true" : "false") }];
+ let readFromMlirBytecode = [{
+ return $_reader.readBool($_storage);
+ }];
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedBool($_storage);
+ }];
+}
+
+def UnitProperty : Property<"bool", "unit property"> {
+ let summary = "unit property";
+ let description = [{
+    A property whose presence or absence is used as a flag.
+
+    This is stored as a boolean that defaults to false, and is named UnitProperty
+    by analogy with UnitAttr, whose documentation gives the more comprehensive
+    rationale and explains the less typical syntax.
+
+    Note that this property does have a syntax for the false case to allow for its
+    use in contexts where default values shouldn't be elided.
+ }];
+ let defaultValue = "false";
+
+ let convertToAttribute = [{
+ if ($_storage)
+ return ::mlir::UnitAttr::get($_ctxt);
+ else
+ return ::mlir::BoolAttr::get($_ctxt, false);
+ }];
+ let convertFromAttribute = [{
+ if (::llvm::isa<::mlir::UnitAttr>($_attr)) {
+ $_storage = true;
+ return ::mlir::success();
+ }
+ if (auto boolAttr = ::llvm::dyn_cast<::mlir::BoolAttr>($_attr)) {
+ $_storage = boolAttr.getValue();
+ return ::mlir::success();
+ }
+ return ::mlir::failure();
+ }];
+
+ let parser = [{
+ ::llvm::StringRef keyword;
+ if (::mlir::failed($_parser.parseOptionalKeyword(&keyword,
+ {"unit", "unit_absent"})))
+ return $_parser.emitError($_parser.getCurrentLocation(),
+ "expected 'unit' or 'unit_absent'");
+ $_storage = (keyword == "unit");
+ }];
+
+ let optionalParser = [{
+ ::llvm::StringRef keyword;
+ if (::mlir::failed($_parser.parseOptionalKeyword(&keyword,
+ {"unit", "unit_absent"})))
+ return std::nullopt;
+ $_storage = (keyword == "unit");
+ }];
+
+ let printer = [{
+ $_printer << ($_storage ? "unit" : "unit_absent")
+ }];
+
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedBool($_storage);
+ }];
+ let readFromMlirBytecode = [{
+ if (::mlir::failed($_reader.readBool($_storage)))
+ return ::mlir::failure();
+ }];
+}
+
+//===----------------------------------------------------------------------===//
+// Primitive property combinators
+
+/// Create a variable named `name` of `prop`'s storage type that is initialized
+/// to the correct default value, if there is one.
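+/// For instance (illustrative), for a property whose storage type is `int64_t`
+/// and whose default value is `0`, `_makePropStorage<prop, "val">.ret` expands
+/// to `int64_t val = 0;`.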
+class _makePropStorage<Property prop, string name> {
+ code ret = prop.storageType # " " # name
+ # !cond(!not(!empty(prop.storageTypeValueOverride)) : " = " # prop.storageTypeValueOverride,
+ !not(!empty(prop.defaultValue)) : " = " # prop.defaultValue,
+ true : "") # ";";
+}
+
+/// The generic class for arrays of some other property, stored as a
+/// `SmallVector` of that property's storage type. This uses an `ArrayAttr` as
+/// its attribute form, though subclasses can override this, as is the case with
+/// `IntArrayProperty` below.
+/// Those wishing to use a non-default number of SmallVector elements should
+/// subclass `ArrayProperty`.
+class ArrayProperty<Property elem = Property<>, string desc = ""> :
+ Property<"::llvm::SmallVector<" # elem.storageType # ">", desc> {
+ let summary = "array of " # elem.summary;
+ let interfaceType = "::llvm::ArrayRef<" # elem.storageType # ">";
+ let convertFromStorage = "::llvm::ArrayRef<" # elem.storageType # ">{$_storage}";
+ let assignToStorage = "$_storage.assign($_value.begin(), $_value.end())";
+
+ let convertFromAttribute = [{
+ auto arrayAttr = ::llvm::dyn_cast_if_present<::mlir::ArrayAttr>($_attr);
+ if (!arrayAttr)
+ return $_diag() << "expected array attribute";
+ for (::mlir::Attribute elemAttr : arrayAttr) {
+ }] # _makePropStorage<elem, "elemVal">.ret # [{
+ auto elemRes = [&](Attribute propAttr, }] # elem.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_attr", "propAttr",
+ !subst("$_storage", "propStorage", elem.convertFromAttribute)) # [{
+ }(elemAttr, elemVal);
+ if (::mlir::failed(elemRes))
+ return ::mlir::failure();
+ $_storage.push_back(std::move(elemVal));
+ }
+ return ::mlir::success();
+ }];
+
+ let convertToAttribute = [{
+ SmallVector<Attribute> elems;
+ for (const auto& elemVal : $_storage) {
+ auto elemAttr = [&](const }] # elem.storageType #[{& propStorage) -> ::mlir::Attribute {
+ }] # !subst("$_storage", "propStorage", elem.convertToAttribute) # [{
+ }(elemVal);
+ elems.push_back(elemAttr);
+ }
+ return ::mlir::ArrayAttr::get($_ctxt, elems);
+ }];
+
+ defvar theParserBegin = [{
+ auto& storage = $_storage;
+ auto parseElemFn = [&]() -> ::mlir::ParseResult {
+ }] # _makePropStorage<elem, "elemVal">.ret # [{
+ auto elemParse = [&](}] # elem.storageType # [{& propStorage) -> ::mlir::ParseResult {
+ }] # !subst("$_storage", "propStorage", elem.parser) # [{
+ return ::mlir::success();
+ }(elemVal);
+ if (::mlir::failed(elemParse))
+ return ::mlir::failure();
+ storage.push_back(std::move(elemVal));
+ return ::mlir::success();
+ };
+ }];
+ let parser = theParserBegin # [{
+ return $_parser.parseCommaSeparatedList(
+ ::mlir::OpAsmParser::Delimiter::Square, parseElemFn);
+ }];
+ // Hack around the lack of a peek method
+ let optionalParser = theParserBegin # [{
+ auto oldLoc = $_parser.getCurrentLocation();
+ auto parseResult = $_parser.parseCommaSeparatedList(
+ ::mlir::OpAsmParser::Delimiter::OptionalSquare, parseElemFn);
+ if (::mlir::failed(parseResult))
+ return ::mlir::failure();
+ auto newLoc = $_parser.getCurrentLocation();
+ if (oldLoc == newLoc)
+ return std::nullopt;
+ return ::mlir::success();
+ }];
+
+ let printer = [{ [&](){
+ $_printer << "[";
+ auto elemPrinter = [&](const }] # elem.storageType # [{& elemVal) {
+ }] # !subst("$_storage", "elemVal", elem.printer) #[{;
+ };
+ ::llvm::interleaveComma($_storage, $_printer, elemPrinter);
+ $_printer << "]";
+ }()}];
+
+ let readFromMlirBytecode = [{
+ uint64_t length;
+ if (::mlir::failed($_reader.readVarInt(length)))
+ return ::mlir::failure();
+ $_storage.reserve(length);
+ for (uint64_t i = 0; i < length; ++i) {
+ }]# _makePropStorage<elem, "elemVal">.ret # [{
+ auto elemRead = [&](}] # elem.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_storage", "propStorage", elem.readFromMlirBytecode) # [{;
+ return ::mlir::success();
+ }(elemVal);
+ if (::mlir::failed(elemRead))
+ return ::mlir::failure();
+ $_storage.push_back(std::move(elemVal));
+ }
+ }];
+
+ let writeToMlirBytecode = [{
+ $_writer.writeVarInt($_storage.size());
+ for (const auto& elemVal : $_storage) {
+ [&]() {
+ }] # !subst("$_storage", "elemVal", elem.writeToMlirBytecode) #[{;
+ }();
+ }
+ }];
+
+ // There's no hash_value for SmallVector<T>, so we construct the ArrayRef ourselves.
+ // In the non-trivial case, we define a mapped range to get internal hash
+ // codes.
+ let hashProperty = !if(!empty(elem.hashProperty),
+ [{::llvm::hash_value(::llvm::ArrayRef<}] # elem.storageType # [{>{$_storage})}],
+ [{[&]() -> ::llvm::hash_code {
+ auto getElemHash = [](const auto& propStorage) -> ::llvm::hash_code {
+ return }] # !subst("$_storage", "propStorage", elem.hashProperty) # [{;
+ };
+ auto mapped = ::llvm::map_range($_storage, getElemHash);
+ return ::llvm::hash_combine_range(mapped.begin(), mapped.end());
+ }()
+ }]);
+}
+
+class IntArrayProperty<string storageTypeParam = "", string desc = ""> :
+ ArrayProperty<IntProperty<storageTypeParam, desc>> {
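+  // For instance (illustrative), IntArrayProperty<"int32_t"> is stored as a
+  // SmallVector<int32_t>, and its convertFromAttribute resolves to the
+  // SmallVectorImpl<int32_t> overload declared in ODSSupport.h.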
+ // Bring back the trivial conversions we don't get in the general case.
+ let convertFromAttribute = [{
+ return convertFromAttribute($_storage, $_attr, $_diag);
+ }];
+ let convertToAttribute = [{
+ return convertToAttribute($_ctxt, $_storage);
+ }];
+}
+
+/// Class for giving a property a default value.
+/// This doesn't change anything about the property other than giving it a default
+/// which can be used by ODS to elide printing.
+class DefaultValuedProperty<Property p, string default = "", string storageDefault = ""> : Property<p.storageType, p.summary> {
+ let defaultValue = default;
+ let storageTypeValueOverride = storageDefault;
+ let baseProperty = p;
+ // Keep up to date with `Property` above.
+ let summary = p.summary;
+ let description = p.description;
+ let storageType = p.storageType;
+ let interfaceType = p.interfaceType;
+ let convertFromStorage = p.convertFromStorage;
+ let assignToStorage = p.assignToStorage;
+ let convertToAttribute = p.convertToAttribute;
+ let convertFromAttribute = p.convertFromAttribute;
+ let hashProperty = p.hashProperty;
+ let parser = p.parser;
+ let optionalParser = p.optionalParser;
+ let printer = p.printer;
+ let readFromMlirBytecode = p.readFromMlirBytecode;
+ let writeToMlirBytecode = p.writeToMlirBytecode;
+}
+
+/// An optional property, stored as a std::optional<p.storageType> and
+/// interfaced with as a std::optional<p.interfaceType>.
+/// The syntax is `none` (or the empty string if elided) for an absent value or
+/// `some<[underlying property]>` when a value is set.
+///
+/// As a special exception, if the underlying property has an optional parser and
+/// no default value (e.g., an integer property), the `some` bracketing is skipped
+/// and parsing and printing delegate to the underlying property. In that case, the
+/// syntax is the syntax of the underlying property, or the keyword `none` in the
+/// rare cases that it is needed. This behavior can be disabled by setting
+/// `canDelegateParsing` to 0.
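+///
+/// For example (illustrative): OptionalProperty<I64Property> delegates, so a
+/// present value prints as a bare integer such as `5` and an absent one as
+/// `none`; OptionalProperty<StringProperty, 0> does not delegate, so it prints
+/// as `some<"foo">` or `none`.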
+class OptionalProperty<Property p, bit canDelegateParsing = 1>
+ : Property<"std::optional<" # p.storageType # ">", "optional " # p.summary> {
+
+  // In the cases where the underlying property is plain old data that's passed
+  // by value, the conversion code is trivial.
+ defvar hasTrivialStorage = !and(!eq(p.convertFromStorage, "$_storage"),
+ !eq(p.assignToStorage, "$_storage = $_value"),
+ !eq(p.storageType, p.interfaceType));
+
+ defvar delegatesParsing = !and(!empty(p.defaultValue),
+ !not(!empty(p.optionalParser)), canDelegateParsing);
+
+ let interfaceType = "std::optional<" # p.interfaceType # ">";
+ let defaultValue = "std::nullopt";
+
+ let convertFromStorage = !if(hasTrivialStorage,
+ p.convertFromStorage,
+ [{($_storage.has_value() ? std::optional<}] # p.interfaceType # ">{"
+ # !subst("$_storage", "(*($_storage))", p.convertFromStorage)
+ # [{} : std::nullopt)}]);
+ let assignToStorage = !if(hasTrivialStorage,
+ p.assignToStorage,
+ [{[&]() {
+ if (!$_value.has_value()) {
+ $_storage = std::nullopt;
+ return;
+ }
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ [&](}] # p.storageType # [{& propStorage) {
+ }] # !subst("$_storage", "propStorage",
+ !subst("$_value", "(*($_value))", p.assignToStorage)) # [{;
+ }(presentVal);
+ $_storage = std::move(presentVal);
+ }()}]);
+
+ let convertFromAttribute = [{
+ auto arrayAttr = ::llvm::dyn_cast<::mlir::ArrayAttr>($_attr);
+ if (!arrayAttr)
+ return $_diag() << "expected optional properties to materialize as arrays";
+ if (arrayAttr.size() > 1)
+ return $_diag() << "expected optional properties to become 0- or 1-element arrays";
+ if (arrayAttr.empty()) {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ ::mlir::Attribute presentAttr = arrayAttr[0];
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ auto presentRes = [&](Attribute propAttr, }] # p.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_storage", "propStorage",
+ !subst("$_attr", "propAttr", p.convertFromAttribute)) # [{
+ }(presentAttr, presentVal);
+ if (::mlir::failed(presentRes))
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ return ::mlir::success();
+ }];
+
+ let convertToAttribute = [{
+ if (!$_storage.has_value()) {
+ return ::mlir::ArrayAttr::get($_ctxt, {});
+ }
+ auto attr = [&]() -> ::mlir::Attribute {
+ }] # !subst("$_storage", "(*($_storage))", p.convertToAttribute) # [{
+ }();
+ return ::mlir::ArrayAttr::get($_ctxt, {attr});
+ }];
+
+ defvar delegatedParserBegin = [{
+ if (::mlir::succeeded($_parser.parseOptionalKeyword("none"))) {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ }] #_makePropStorage<p, "presentVal">.ret # [{
+ auto delegParseResult = [&](}] # p.storageType # [{& propStorage) -> ::mlir::OptionalParseResult {
+ }] # !subst("$_storage", "propStorage", p.optionalParser) # [{
+ return ::mlir::success();
+ }(presentVal);
+ if (!delegParseResult.has_value()) {
+ }];
+
+ defvar delegatedParserEnd = [{
+ }
+ if (delegParseResult.has_value() && ::mlir::failed(*delegParseResult))
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ return ::mlir::success();
+ }];
+  // If we're being explicitly called for our parser, we expect to have been
+  // printed into a context where the default value isn't elided. Therefore, a
+  // not-present result from the underlying parser is a failure.
+ defvar delegatedParser = delegatedParserBegin # [{
+ return ::mlir::failure();
+ }] # delegatedParserEnd;
+ defvar delegatedOptionalParser = delegatedParserBegin # [{
+ return std::nullopt;
+ }] # delegatedParserEnd;
+
+ defvar generalParserBegin = [{
+ ::llvm::StringRef keyword;
+ if (::mlir::failed($_parser.parseOptionalKeyword(&keyword, {"none", "some"}))) {
+ }];
+ defvar generalParserEnd = [{
+ }
+ if (keyword == "none") {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ if (::mlir::failed($_parser.parseLess()))
+ return ::mlir::failure();
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ auto presentParse = [&](}] # p.storageType # [{& propStorage) -> ::mlir::ParseResult {
+ }] # !subst("$_storage", "propStorage", p.parser) # [{
+ return ::mlir::success();
+ }(presentVal);
+ if (presentParse || $_parser.parseGreater())
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ }];
+ defvar generalParser = generalParserBegin # [{
+ return $_parser.emitError($_parser.getCurrentLocation(), "expected 'none' or 'some<prop>'");
+ }] # generalParserEnd;
+ defvar generalOptionalParser = generalParserBegin # [{
+ return std::nullopt;
+ }] # generalParserEnd;
+
+ let parser = !if(delegatesParsing, delegatedParser, generalParser);
+ let optionalParser = !if(delegatesParsing,
+ delegatedOptionalParser, generalOptionalParser);
+
+ defvar delegatedPrinter = [{
+ [&]() {
+ if (!$_storage.has_value()) {
+ $_printer << "none";
+ return;
+ }
+ }] # !subst("$_storage", "(*($_storage))", p.printer) # [{;
+ }()}];
+ defvar generalPrinter = [{
+ [&]() {
+ if (!$_storage.has_value()) {
+ $_printer << "none";
+ return;
+ }
+ $_printer << "some<";
+ }] # !subst("$_storage", "(*($_storage))", p.printer) # [{;
+ $_printer << ">";
+ }()}];
+ let printer = !if(delegatesParsing, delegatedPrinter, generalPrinter);
+
+ let readFromMlirBytecode = [{
+ bool isPresent = false;
+ if (::mlir::failed($_reader.readBool(isPresent)))
+ return ::mlir::failure();
+ if (!isPresent) {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ auto presentResult = [&](}] # p.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_storage", "propStorage", p.readFromMlirBytecode) # [{;
+ return ::mlir::success();
+ }(presentVal);
+ if (::mlir::failed(presentResult))
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ }];
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedBool($_storage.has_value());
+ if (!$_storage.has_value())
+ return;
+ }] # !subst("$_storage", "(*($_storage))", p.writeToMlirBytecode);
+
+ let hashProperty = !if(!empty(p.hashProperty), p.hashProperty,
+ [{ ::llvm::hash_value($_storage.has_value() ? std::optional<::llvm::hash_code>{}] #
+ !subst("$_storage", "(*($_storage))", p.hashProperty) #[{} : std::nullopt) }]);
+ assert !or(!not(delegatesParsing), !eq(defaultValue, "std::nullopt")),
+ "For delegated parsing to be used, the default value must be nullopt. " #
+ "To use a non-trivial default, set the canDelegateParsing argument to 0";
+}
#endif // PROPERTIES
diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h
index cc5853c..768291a 100644
--- a/mlir/include/mlir/TableGen/Operator.h
+++ b/mlir/include/mlir/TableGen/Operator.h
@@ -384,7 +384,7 @@ private:
SmallVector<NamedAttribute, 4> attributes;
/// The properties of the op.
- SmallVector<NamedProperty> properties;
+ SmallVector<NamedProperty, 4> properties;
/// The arguments of the op (operands and native attributes).
SmallVector<Argument, 4> arguments;
diff --git a/mlir/include/mlir/TableGen/Property.h b/mlir/include/mlir/TableGen/Property.h
index d0d6f49..702e675 100644
--- a/mlir/include/mlir/TableGen/Property.h
+++ b/mlir/include/mlir/TableGen/Property.h
@@ -35,12 +35,20 @@ class Property {
public:
explicit Property(const llvm::Record *record);
explicit Property(const llvm::DefInit *init);
- Property(StringRef storageType, StringRef interfaceType,
- StringRef convertFromStorageCall, StringRef assignToStorageCall,
- StringRef convertToAttributeCall, StringRef convertFromAttributeCall,
+ Property(StringRef summary, StringRef description, StringRef storageType,
+ StringRef interfaceType, StringRef convertFromStorageCall,
+ StringRef assignToStorageCall, StringRef convertToAttributeCall,
+ StringRef convertFromAttributeCall, StringRef parserCall,
+ StringRef optionalParserCall, StringRef printerCall,
StringRef readFromMlirBytecodeCall,
StringRef writeToMlirBytecodeCall, StringRef hashPropertyCall,
- StringRef defaultValue);
+ StringRef defaultValue, StringRef storageTypeValueOverride);
+
+ // Returns the summary (for error messages) of this property's type.
+ StringRef getSummary() const { return summary; }
+
+ // Returns the description of this property.
+ StringRef getDescription() const { return description; }
// Returns the storage type.
StringRef getStorageType() const { return storageType; }
@@ -66,6 +74,19 @@ public:
return convertFromAttributeCall;
}
+ // Returns the method call which parses this property from textual MLIR.
+ StringRef getParserCall() const { return parserCall; }
+
+ // Returns true if this property has defined an optional parser.
+ bool hasOptionalParser() const { return !optionalParserCall.empty(); }
+
+ // Returns the method call which optionally parses this property from textual
+ // MLIR.
+ StringRef getOptionalParserCall() const { return optionalParserCall; }
+
+ // Returns the method call which prints this property to textual MLIR.
+ StringRef getPrinterCall() const { return printerCall; }
+
// Returns the method call which reads this property from
// bytecode and assign it to the storage.
StringRef getReadFromMlirBytecodeCall() const {
@@ -87,6 +108,24 @@ public:
// Returns the default value for this Property.
StringRef getDefaultValue() const { return defaultValue; }
+ // Returns whether this Property has a default storage-type value that is
+ // distinct from its default interface-type value.
+ bool hasStorageTypeValueOverride() const {
+ return !storageTypeValueOverride.empty();
+ }
+
+ StringRef getStorageTypeValueOverride() const {
+ return storageTypeValueOverride;
+ }
+
+ // Returns this property's TableGen def-name.
+ StringRef getPropertyDefName() const;
+
+  // Returns the base-level property that this Property constraint is based on,
+  // or the Property itself otherwise. (Note: there are currently no property
+  // constraints; this function is added for future-proofing.)
+ Property getBaseProperty() const;
+
// Returns the TableGen definition this Property was constructed from.
const llvm::Record &getDef() const { return *def; }
@@ -95,16 +134,22 @@ private:
const llvm::Record *def;
// Elements describing a Property, in general fetched from the record.
+ StringRef summary;
+ StringRef description;
StringRef storageType;
StringRef interfaceType;
StringRef convertFromStorageCall;
StringRef assignToStorageCall;
StringRef convertToAttributeCall;
StringRef convertFromAttributeCall;
+ StringRef parserCall;
+ StringRef optionalParserCall;
+ StringRef printerCall;
StringRef readFromMlirBytecodeCall;
StringRef writeToMlirBytecodeCall;
StringRef hashPropertyCall;
StringRef defaultValue;
+ StringRef storageTypeValueOverride;
};
// A struct wrapping an op property and its name together
diff --git a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
index aca25ca..6be5548 100644
--- a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
@@ -9,6 +9,7 @@
#define MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_
#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Builders.h"
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
index 004428b..c76d489 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
@@ -774,6 +774,94 @@ struct ConvertIllegalShapeCastOpsToTransposes
}
};
+/// Returns an iterator over the dims (inc scalability) of a VectorType.
+static auto getDims(VectorType vType) {
+ return llvm::zip_equal(vType.getShape(), vType.getScalableDims());
+}
+
+/// Helper to drop (fixed-size) unit dims from a VectorType.
+static VectorType dropUnitDims(VectorType vType) {
+ SmallVector<bool> scalableFlags;
+ SmallVector<int64_t> dimSizes;
+ for (auto dim : getDims(vType)) {
+ if (dim == std::make_tuple(1, false))
+ continue;
+ auto [size, scalableFlag] = dim;
+ dimSizes.push_back(size);
+ scalableFlags.push_back(scalableFlag);
+ }
+ return VectorType::get(dimSizes, vType.getElementType(), scalableFlags);
+}
+
+/// A pattern to swap shape_cast(transpose) with transpose(shape_cast) if the
+/// shape_cast only drops unit dimensions.
+///
+/// This simplifies the transpose making it possible for other legalization
+/// rewrites to handle it.
+///
+/// Example:
+///
+/// BEFORE:
+/// ```mlir
+/// %0 = vector.transpose %vector, [3, 0, 1, 2]
+/// : vector<1x1x4x[4]xf32> to vector<[4]x1x1x4xf32>
+/// %1 = vector.shape_cast %0 : vector<[4]x1x1x4xf32> to vector<[4]x4xf32>
+/// ```
+///
+/// AFTER:
+/// ```mlir
+/// %0 = vector.shape_cast %arg0 : vector<1x1x4x[4]xf32> to vector<4x[4]xf32>
+/// %1 = vector.transpose %0, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
+/// ```
+struct SwapShapeCastOfTranspose : public OpRewritePattern<vector::ShapeCastOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(vector::ShapeCastOp shapeCastOp,
+ PatternRewriter &rewriter) const override {
+ auto transposeOp =
+ shapeCastOp.getSource().getDefiningOp<vector::TransposeOp>();
+ if (!transposeOp)
+ return rewriter.notifyMatchFailure(shapeCastOp, "not TransposeOp");
+
+ auto resultType = shapeCastOp.getResultVectorType();
+ if (resultType.getRank() <= 1)
+ return rewriter.notifyMatchFailure(shapeCastOp, "result rank too low");
+
+ if (resultType != dropUnitDims(shapeCastOp.getSourceVectorType()))
+ return rewriter.notifyMatchFailure(
+ shapeCastOp, "ShapeCastOp changes non-unit dimension(s)");
+
+ auto transposeSourceVectorType = transposeOp.getSourceVectorType();
+ auto transposeSourceDims =
+ llvm::to_vector(getDims(transposeSourceVectorType));
+
+ // Construct a map from dimIdx -> number of dims dropped before dimIdx.
+ SmallVector<int64_t> droppedDimsBefore(transposeSourceVectorType.getRank());
+ int64_t droppedDims = 0;
+ for (auto [i, dim] : llvm::enumerate(transposeSourceDims)) {
+ droppedDimsBefore[i] = droppedDims;
+ if (dim == std::make_tuple(1, false))
+ ++droppedDims;
+ }
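+    // Illustrative example: for a transpose source of type vector<1x1x4x[4]xf32>,
+    // droppedDimsBefore is [0, 1, 2, 2], so the permutation [3, 0, 1, 2] below
+    // becomes [1, 0] once the unit dims are dropped.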
+
+ // Drop unit dims from transpose permutation.
+ auto perm = transposeOp.getPermutation();
+ SmallVector<int64_t> newPerm;
+ for (int64_t idx : perm) {
+ if (transposeSourceDims[idx] == std::make_tuple(1, false))
+ continue;
+ newPerm.push_back(idx - droppedDimsBefore[idx]);
+ }
+
+ auto loc = shapeCastOp.getLoc();
+ auto newShapeCastOp = rewriter.create<vector::ShapeCastOp>(
+ loc, dropUnitDims(transposeSourceVectorType), transposeOp.getVector());
+ rewriter.replaceOpWithNewOp<vector::TransposeOp>(shapeCastOp,
+ newShapeCastOp, newPerm);
+ return success();
+ }
+};
+
/// Rewrites an illegal/unsupported SVE transfer_write(transpose) to instead use
/// the ZA state. This workaround exists to support these transposes when ZA is
/// available.
@@ -939,7 +1027,8 @@ struct VectorLegalizationPass
patterns.add<FoldExtractFromVectorOfSMELikeCreateMasks,
LiftIllegalVectorTransposeToMemory,
ConvertIllegalShapeCastOpsToTransposes,
- LowerIllegalTransposeStoreViaZA>(context);
+ SwapShapeCastOfTranspose, LowerIllegalTransposeStoreViaZA>(
+ context);
// Note: These two patterns are added with a high benefit to ensure:
// - Masked outer products are handled before unmasked ones
// - Multi-tile writes are lowered as a store loop (if possible)
diff --git a/mlir/lib/IR/ODSSupport.cpp b/mlir/lib/IR/ODSSupport.cpp
index 6e968d62..d56c75e 100644
--- a/mlir/lib/IR/ODSSupport.cpp
+++ b/mlir/lib/IR/ODSSupport.cpp
@@ -33,6 +33,50 @@ Attribute mlir::convertToAttribute(MLIRContext *ctx, int64_t storage) {
return IntegerAttr::get(IntegerType::get(ctx, 64), storage);
}
+LogicalResult
+mlir::convertFromAttribute(int32_t &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ auto valueAttr = dyn_cast<IntegerAttr>(attr);
+ if (!valueAttr) {
+ emitError() << "expected IntegerAttr for key `value`";
+ return failure();
+ }
+ storage = valueAttr.getValue().getSExtValue();
+ return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx, int32_t storage) {
+ return IntegerAttr::get(IntegerType::get(ctx, 32), storage);
+}
+
+LogicalResult
+mlir::convertFromAttribute(std::string &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ auto valueAttr = dyn_cast<StringAttr>(attr);
+ if (!valueAttr)
+ return emitError()
+ << "expected string property to come from string attribute";
+ storage = valueAttr.getValue().str();
+ return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx,
+ const std::string &storage) {
+ return StringAttr::get(ctx, storage);
+}
+
+LogicalResult
+mlir::convertFromAttribute(bool &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ auto valueAttr = dyn_cast<BoolAttr>(attr);
+ if (!valueAttr)
+ return emitError()
+        << "expected boolean property to come from bool attribute";
+ storage = valueAttr.getValue();
+ return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx, bool storage) {
+ return BoolAttr::get(ctx, storage);
+}
+
template <typename DenseArrayTy, typename T>
LogicalResult
convertDenseArrayFromAttr(MutableArrayRef<T> storage, Attribute attr,
@@ -64,6 +108,33 @@ mlir::convertFromAttribute(MutableArrayRef<int32_t> storage, Attribute attr,
"DenseI32ArrayAttr");
}
+template <typename DenseArrayTy, typename T>
+LogicalResult
+convertDenseArrayFromAttr(SmallVectorImpl<T> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError,
+ StringRef denseArrayTyStr) {
+ auto valueAttr = dyn_cast<DenseArrayTy>(attr);
+ if (!valueAttr) {
+ emitError() << "expected " << denseArrayTyStr << " for key `value`";
+ return failure();
+ }
+ storage.resize_for_overwrite(valueAttr.size());
+ llvm::copy(valueAttr.asArrayRef(), storage.begin());
+ return success();
+}
+LogicalResult
+mlir::convertFromAttribute(SmallVectorImpl<int64_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ return convertDenseArrayFromAttr<DenseI64ArrayAttr>(storage, attr, emitError,
+ "DenseI64ArrayAttr");
+}
+LogicalResult
+mlir::convertFromAttribute(SmallVectorImpl<int32_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ return convertDenseArrayFromAttr<DenseI32ArrayAttr>(storage, attr, emitError,
+ "DenseI32ArrayAttr");
+}
+
Attribute mlir::convertToAttribute(MLIRContext *ctx,
ArrayRef<int64_t> storage) {
return DenseI64ArrayAttr::get(ctx, storage);
diff --git a/mlir/lib/TableGen/Property.cpp b/mlir/lib/TableGen/Property.cpp
index e61d2fd..b86b87df 100644
--- a/mlir/lib/TableGen/Property.cpp
+++ b/mlir/lib/TableGen/Property.cpp
@@ -33,16 +33,23 @@ static StringRef getValueAsString(const Init *init) {
}
Property::Property(const Record *def)
- : Property(getValueAsString(def->getValueInit("storageType")),
- getValueAsString(def->getValueInit("interfaceType")),
- getValueAsString(def->getValueInit("convertFromStorage")),
- getValueAsString(def->getValueInit("assignToStorage")),
- getValueAsString(def->getValueInit("convertToAttribute")),
- getValueAsString(def->getValueInit("convertFromAttribute")),
- getValueAsString(def->getValueInit("readFromMlirBytecode")),
- getValueAsString(def->getValueInit("writeToMlirBytecode")),
- getValueAsString(def->getValueInit("hashProperty")),
- getValueAsString(def->getValueInit("defaultValue"))) {
+ : Property(
+ getValueAsString(def->getValueInit("summary")),
+ getValueAsString(def->getValueInit("description")),
+ getValueAsString(def->getValueInit("storageType")),
+ getValueAsString(def->getValueInit("interfaceType")),
+ getValueAsString(def->getValueInit("convertFromStorage")),
+ getValueAsString(def->getValueInit("assignToStorage")),
+ getValueAsString(def->getValueInit("convertToAttribute")),
+ getValueAsString(def->getValueInit("convertFromAttribute")),
+ getValueAsString(def->getValueInit("parser")),
+ getValueAsString(def->getValueInit("optionalParser")),
+ getValueAsString(def->getValueInit("printer")),
+ getValueAsString(def->getValueInit("readFromMlirBytecode")),
+ getValueAsString(def->getValueInit("writeToMlirBytecode")),
+ getValueAsString(def->getValueInit("hashProperty")),
+ getValueAsString(def->getValueInit("defaultValue")),
+ getValueAsString(def->getValueInit("storageTypeValueOverride"))) {
this->def = def;
assert((def->isSubClassOf("Property") || def->isSubClassOf("Attr")) &&
"must be subclass of TableGen 'Property' class");
@@ -50,22 +57,44 @@ Property::Property(const Record *def)
Property::Property(const DefInit *init) : Property(init->getDef()) {}
-Property::Property(StringRef storageType, StringRef interfaceType,
+Property::Property(StringRef summary, StringRef description,
+ StringRef storageType, StringRef interfaceType,
StringRef convertFromStorageCall,
StringRef assignToStorageCall,
StringRef convertToAttributeCall,
- StringRef convertFromAttributeCall,
+ StringRef convertFromAttributeCall, StringRef parserCall,
+ StringRef optionalParserCall, StringRef printerCall,
StringRef readFromMlirBytecodeCall,
StringRef writeToMlirBytecodeCall,
- StringRef hashPropertyCall, StringRef defaultValue)
- : storageType(storageType), interfaceType(interfaceType),
+ StringRef hashPropertyCall, StringRef defaultValue,
+ StringRef storageTypeValueOverride)
+ : summary(summary), description(description), storageType(storageType),
+ interfaceType(interfaceType),
convertFromStorageCall(convertFromStorageCall),
assignToStorageCall(assignToStorageCall),
convertToAttributeCall(convertToAttributeCall),
convertFromAttributeCall(convertFromAttributeCall),
+ parserCall(parserCall), optionalParserCall(optionalParserCall),
+ printerCall(printerCall),
readFromMlirBytecodeCall(readFromMlirBytecodeCall),
writeToMlirBytecodeCall(writeToMlirBytecodeCall),
- hashPropertyCall(hashPropertyCall), defaultValue(defaultValue) {
+ hashPropertyCall(hashPropertyCall), defaultValue(defaultValue),
+ storageTypeValueOverride(storageTypeValueOverride) {
if (storageType.empty())
storageType = "Property";
}
+
+StringRef Property::getPropertyDefName() const {
+ if (def->isAnonymous()) {
+ return getBaseProperty().def->getName();
+ }
+ return def->getName();
+}
+
+Property Property::getBaseProperty() const {
+ if (const auto *defInit =
+ llvm::dyn_cast<llvm::DefInit>(def->getValueInit("baseProperty"))) {
+ return Property(defInit).getBaseProperty();
+ }
+ return *this;
+}
diff --git a/mlir/test/Dialect/ArmSME/vector-legalization.mlir b/mlir/test/Dialect/ArmSME/vector-legalization.mlir
index 458906a..adc02ad 100644
--- a/mlir/test/Dialect/ArmSME/vector-legalization.mlir
+++ b/mlir/test/Dialect/ArmSME/vector-legalization.mlir
@@ -646,3 +646,29 @@ func.func @negative_transpose_store_scalable_via_za__bad_source_shape(%vec: vect
vector.transfer_write %tr, %dest[%i, %j] {in_bounds = [true, true]} : vector<[7]x2xf32>, memref<?x?xf32>
return
}
+
+// -----
+
+// CHECK-LABEL: @swap_shape_cast_of_transpose(
+// CHECK-SAME: %[[VEC:.*]]: vector<1x1x4x[4]xf32>)
+func.func @swap_shape_cast_of_transpose(%vector: vector<1x1x4x[4]xf32>) -> vector<[4]x4xf32> {
+ // CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[VEC]] : vector<1x1x4x[4]xf32> to vector<4x[4]xf32>
+ // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[SHAPE_CAST]], [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
+ // CHECK: return %[[TRANSPOSE]]
+ %0 = vector.transpose %vector, [3, 0, 1, 2] : vector<1x1x4x[4]xf32> to vector<[4]x1x1x4xf32>
+ %1 = vector.shape_cast %0 : vector<[4]x1x1x4xf32> to vector<[4]x4xf32>
+ return %1 : vector<[4]x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @swap_shape_cast_of_transpose_units_dims_before_and_after(
+// CHECK-SAME: %[[VEC:.*]]: vector<1x1x1x4x[4]x1xf32>)
+func.func @swap_shape_cast_of_transpose_units_dims_before_and_after(%vector: vector<1x1x1x4x[4]x1xf32>) -> vector<[4]x4xf32> {
+ // CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[VEC]] : vector<1x1x1x4x[4]x1xf32> to vector<4x[4]xf32>
+ // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[SHAPE_CAST]], [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
+ // CHECK: return %[[TRANSPOSE]]
+ %0 = vector.transpose %vector, [4, 1, 0, 2, 3, 5] : vector<1x1x1x4x[4]x1xf32> to vector<[4]x1x1x1x4x1xf32>
+ %1 = vector.shape_cast %0 : vector<[4]x1x1x1x4x1xf32> to vector<[4]x4xf32>
+ return %1 : vector<[4]x4xf32>
+}
diff --git a/mlir/test/IR/properties.mlir b/mlir/test/IR/properties.mlir
index 01ea856..418b81d 100644
--- a/mlir/test/IR/properties.mlir
+++ b/mlir/test/IR/properties.mlir
@@ -2,10 +2,10 @@
// # RUN: mlir-opt %s -mlir-print-op-generic -split-input-file | mlir-opt -mlir-print-op-generic | FileCheck %s --check-prefix=GENERIC
// CHECK: test.with_properties
-// CHECK-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>{{$}}
+// CHECK-SAME: a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]{{$}}
// GENERIC: "test.with_properties"()
-// GENERIC-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}> : () -> ()
-test.with_properties <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>
+// GENERIC-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo", c = "bar", flag = true}> : () -> ()
+test.with_properties a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]
// CHECK: test.with_nice_properties
// CHECK-SAME: "foo bar" is -3{{$}}
@@ -34,18 +34,48 @@ test.using_property_in_custom [1, 4, 20]
// GENERIC-SAME: }>
test.using_property_ref_in_custom 1 + 4 = 5
-// CHECK: test.with_default_valued_properties {{$}}
+// CHECK: test.with_default_valued_properties na{{$}}
// GENERIC: "test.with_default_valued_properties"()
-// GENERIC-SAME: <{a = 0 : i32}>
-test.with_default_valued_properties <{a = 0 : i32}>
+// GENERIC-SAME: <{a = 0 : i32, b = "", c = -1 : i32, unit = false}> : () -> ()
+test.with_default_valued_properties 0 "" -1 unit_absent
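+// Note: every value above is the corresponding property's default, so the
+// custom printer elides them all; only the `na` alternative for `a` prints.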
+
+// CHECK: test.with_default_valued_properties 1 "foo" 0 unit{{$}}
+// GENERIC: "test.with_default_valued_properties"()
+// GENERIC-SAME: <{a = 1 : i32, b = "foo", c = 0 : i32, unit}> : () -> ()
+test.with_default_valued_properties 1 "foo" 0 unit
// CHECK: test.with_optional_properties
-// CHECK-SAME: <{b = 0 : i32}>
+// CHECK-SAME: simple = 0
+// GENERIC: "test.with_optional_properties"()
+// GENERIC-SAME: <{hasDefault = [], hasUnit = false, longSyntax = [], maybeUnit = [], nested = [], nonTrivialStorage = [], simple = [0]}> : () -> ()
+test.with_optional_properties simple = 0
+
+// CHECK: test.with_optional_properties{{$}}
// GENERIC: "test.with_optional_properties"()
-// GENERIC-SAME: <{b = 0 : i32}>
-test.with_optional_properties <{b = 0 : i32}>
+// GENERIC-SAME: simple = []
+test.with_optional_properties
-// CHECK: test.with_optional_properties {{$}}
+// CHECK: test.with_optional_properties
+// CHECK-SAME: anAttr = 0 simple = 1 nonTrivialStorage = "foo" hasDefault = some<0> nested = some<1> longSyntax = some<"bar"> hasUnit maybeUnit = some<unit>
// GENERIC: "test.with_optional_properties"()
-// GENERIC-SAME: : () -> ()
+// GENERIC-SAME: <{anAttr = 0 : i32, hasDefault = [0], hasUnit, longSyntax = ["bar"], maybeUnit = [unit], nested = {{\[}}[1]], nonTrivialStorage = ["foo"], simple = [1]}> : () -> ()
test.with_optional_properties
+ anAttr = 0
+ simple = 1
+ nonTrivialStorage = "foo"
+ hasDefault = some<0>
+ nested = some<1>
+ longSyntax = some<"bar">
+ hasUnit
+ maybeUnit = some<unit>
+
+// CHECK: test.with_optional_properties
+// CHECK-SAME: nested = some<none>
+// GENERIC: "test.with_optional_properties"()
+// GENERIC-SAME: nested = {{\[}}[]]
+test.with_optional_properties nested = some<none>
+
+// CHECK: test.with_array_properties
+// CHECK-SAME: ints = [1, 2] strings = ["a", "b"] nested = {{\[}}[1, 2], [3, 4]] opt = [-1, -2] explicitOptions = [none, 0] explicitUnits = [unit, unit_absent]
+// GENERIC: "test.with_array_properties"()
+test.with_array_properties ints = [1, 2] strings = ["a", "b"] nested = [[1, 2], [3, 4]] opt = [-1, -2] explicitOptions = [none, 0] explicitUnits = [unit, unit_absent] [] thats_has_default
diff --git a/mlir/test/IR/traits.mlir b/mlir/test/IR/traits.mlir
index 1e04670..49cfd7e 100644
--- a/mlir/test/IR/traits.mlir
+++ b/mlir/test/IR/traits.mlir
@@ -502,6 +502,25 @@ func.func @succeededOilistTrivial() {
// -----
+// CHECK-LABEL: @succeededOilistTrivialProperties
+func.func @succeededOilistTrivialProperties() {
+ // CHECK: test.oilist_with_keywords_only_properties keyword
+ test.oilist_with_keywords_only_properties keyword
+ // CHECK: test.oilist_with_keywords_only_properties otherKeyword
+ test.oilist_with_keywords_only_properties otherKeyword
+ // CHECK: test.oilist_with_keywords_only_properties keyword otherKeyword
+ test.oilist_with_keywords_only_properties keyword otherKeyword
+ // CHECK: test.oilist_with_keywords_only_properties keyword otherKeyword
+ test.oilist_with_keywords_only_properties otherKeyword keyword
+ // CHECK: test.oilist_with_keywords_only_properties thirdKeyword
+ test.oilist_with_keywords_only_properties thirdKeyword
+ // CHECK: test.oilist_with_keywords_only_properties keyword thirdKeyword
+ test.oilist_with_keywords_only_properties keyword thirdKeyword
+ return
+}
+
+// -----
+
// CHECK-LABEL: @succeededOilistSimple
func.func @succeededOilistSimple(%arg0 : i32, %arg1 : i32, %arg2 : i32) {
// CHECK: test.oilist_with_simple_args keyword %{{.*}} : i32
diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index 7a7af2b..a789ab9 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -408,10 +408,10 @@ func.func @test_move_op_before_rollback() {
// CHECK-LABEL: func @test_properties_rollback()
func.func @test_properties_rollback() {
- // CHECK: test.with_properties <{a = 32 : i64,
+ // CHECK: test.with_properties a = 32,
// expected-remark @below{{op 'test.with_properties' is not legalizable}}
test.with_properties
- <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>
+ a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]
{modify_inplace}
"test.return"() : () -> ()
}
diff --git a/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp b/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
index 6e75dd3..9ed1b3a 100644
--- a/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
+++ b/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
@@ -297,11 +297,17 @@ void test::printSwitchCases(OpAsmPrinter &p, Operation *op,
// CustomUsingPropertyInCustom
//===----------------------------------------------------------------------===//
-bool test::parseUsingPropertyInCustom(OpAsmParser &parser, int64_t value[3]) {
- return parser.parseLSquare() || parser.parseInteger(value[0]) ||
- parser.parseComma() || parser.parseInteger(value[1]) ||
- parser.parseComma() || parser.parseInteger(value[2]) ||
- parser.parseRSquare();
+bool test::parseUsingPropertyInCustom(OpAsmParser &parser,
+ SmallVector<int64_t> &value) {
+ auto elemParser = [&]() {
+ int64_t v = 0;
+ if (failed(parser.parseInteger(v)))
+ return failure();
+ value.push_back(v);
+ return success();
+ };
+ return failed(parser.parseCommaSeparatedList(OpAsmParser::Delimiter::Square,
+ elemParser));
}
void test::printUsingPropertyInCustom(OpAsmPrinter &printer, Operation *op,
diff --git a/mlir/test/lib/Dialect/Test/TestFormatUtils.h b/mlir/test/lib/Dialect/Test/TestFormatUtils.h
index 7e9cd83..6d4df7d 100644
--- a/mlir/test/lib/Dialect/Test/TestFormatUtils.h
+++ b/mlir/test/lib/Dialect/Test/TestFormatUtils.h
@@ -160,7 +160,8 @@ void printSwitchCases(mlir::OpAsmPrinter &p, mlir::Operation *op,
// CustomUsingPropertyInCustom
//===----------------------------------------------------------------------===//
-bool parseUsingPropertyInCustom(mlir::OpAsmParser &parser, int64_t value[3]);
+bool parseUsingPropertyInCustom(mlir::OpAsmParser &parser,
+ llvm::SmallVector<int64_t> &value);
void printUsingPropertyInCustom(mlir::OpAsmPrinter &printer,
mlir::Operation *op,
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 9450764..2d97a02 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2947,11 +2947,18 @@ def TestVersionedOpC : TEST_Op<"versionedC"> {
// Op with a properties struct defined inline.
def TestOpWithProperties : TEST_Op<"with_properties"> {
- let assemblyFormat = "prop-dict attr-dict";
+ let assemblyFormat = [{
+ `a` `=` $a `,`
+ `b` `=` $b `,`
+ `c` `=` $c `,`
+ `flag` `=` $flag `,`
+ `array` `=` $array attr-dict}];
let arguments = (ins
- IntProperty<"int64_t">:$a,
+ I64Property:$a,
StrAttr:$b, // Attributes can directly be used here.
- ArrayProperty<"int64_t", 4>:$array // example of an array
+ StringProperty:$c,
+ BoolProperty:$flag,
+ IntArrayProperty<"int64_t">:$array // example of an array
);
}
@@ -2974,7 +2981,7 @@ def TestOpWithPropertiesAndInferredType
// Demonstrate how to wrap an existing C++ class named MyPropStruct.
def MyStructProperty : Property<"MyPropStruct"> {
- let convertToAttribute = "$_storage.asAttribute($_ctxt)";
+ let convertToAttribute = "return $_storage.asAttribute($_ctxt);";
let convertFromAttribute = "return MyPropStruct::setFromAttr($_storage, $_attr, $_diag);";
let hashProperty = "$_storage.hash();";
}
@@ -2988,14 +2995,14 @@ def TestOpWithWrappedProperties : TEST_Op<"with_wrapped_properties"> {
def TestOpUsingPropertyInCustom : TEST_Op<"using_property_in_custom"> {
let assemblyFormat = "custom<UsingPropertyInCustom>($prop) attr-dict";
- let arguments = (ins ArrayProperty<"int64_t", 3>:$prop);
+ let arguments = (ins IntArrayProperty<"int64_t">:$prop);
}
def TestOpUsingPropertyInCustomAndOther
: TEST_Op<"using_property_in_custom_and_other"> {
let assemblyFormat = "custom<UsingPropertyInCustom>($prop) prop-dict attr-dict";
let arguments = (ins
- ArrayProperty<"int64_t", 3>:$prop,
+ IntArrayProperty<"int64_t">:$prop,
IntProperty<"int64_t">:$other
);
}
@@ -3021,7 +3028,7 @@ def TestOpUsingIntPropertyWithWorseBytecode
def PropertiesWithCustomPrint : Property<"PropertiesWithCustomPrint"> {
let convertToAttribute = [{
- getPropertiesAsAttribute($_ctxt, $_storage)
+ return getPropertiesAsAttribute($_ctxt, $_storage);
}];
let convertFromAttribute = [{
return setPropertiesFromAttribute($_storage, $_attr, $_diag);
@@ -3085,7 +3092,7 @@ def TestOpWithNiceProperties : TEST_Op<"with_nice_properties"> {
def VersionedProperties : Property<"VersionedProperties"> {
let convertToAttribute = [{
- getPropertiesAsAttribute($_ctxt, $_storage)
+ return getPropertiesAsAttribute($_ctxt, $_storage);
}];
let convertFromAttribute = [{
return setPropertiesFromAttribute($_storage, $_attr, $_diag);
@@ -3131,13 +3138,65 @@ def TestOpWithVersionedProperties : TEST_Op<"with_versioned_properties"> {
}
def TestOpWithDefaultValuedProperties : TEST_Op<"with_default_valued_properties"> {
- let assemblyFormat = "prop-dict attr-dict";
- let arguments = (ins DefaultValuedAttr<I32Attr, "0">:$a);
+ let assemblyFormat = [{
+ ($a^) : (`na`)?
+ ($b^)?
+ ($c^)?
+ ($unit^)?
+ attr-dict
+ }];
+ let arguments = (ins DefaultValuedAttr<I32Attr, "0">:$a,
+ DefaultValuedProperty<StringProperty, "\"\"">:$b,
+ DefaultValuedProperty<IntProperty<"int32_t">, "-1">:$c,
+ UnitProperty:$unit);
}
def TestOpWithOptionalProperties : TEST_Op<"with_optional_properties"> {
- let assemblyFormat = "prop-dict attr-dict";
- let arguments = (ins OptionalAttr<I32Attr>:$a, OptionalAttr<I32Attr>:$b);
+ let assemblyFormat = [{
+ (`anAttr` `=` $anAttr^)?
+ (`simple` `=` $simple^)?
+ (`nonTrivialStorage` `=` $nonTrivialStorage^)?
+ (`hasDefault` `=` $hasDefault^)?
+ (`nested` `=` $nested^)?
+ (`longSyntax` `=` $longSyntax^)?
+ (`hasUnit` $hasUnit^)?
+ (`maybeUnit` `=` $maybeUnit^)?
+ attr-dict
+ }];
+ let arguments = (ins
+ OptionalAttr<I32Attr>:$anAttr,
+ OptionalProperty<I64Property>:$simple,
+ OptionalProperty<StringProperty>:$nonTrivialStorage,
+    // Confirm that default-valued properties wrapped in OptionalProperty
+    // default to nullopt and use the long syntax.
+ OptionalProperty<DefaultValuedProperty<I64Property, "0">>:$hasDefault,
+ OptionalProperty<OptionalProperty<I64Property>>:$nested,
+ OptionalProperty<StringProperty, 0>:$longSyntax,
+ UnitProperty:$hasUnit,
+ OptionalProperty<UnitProperty>:$maybeUnit);
+}
+
+def TestOpWithArrayProperties : TEST_Op<"with_array_properties"> {
+ let assemblyFormat = [{
+ `ints` `=` $ints
+ `strings` `=` $strings
+ `nested` `=` $nested
+ `opt` `=` $opt
+ `explicitOptions` `=` $explicitOptions
+ `explicitUnits` `=` $explicitUnits
+ ($hasDefault^ `thats_has_default`)?
+ attr-dict
+ }];
+ let arguments = (ins
+ ArrayProperty<I64Property>:$ints,
+ ArrayProperty<StringProperty>:$strings,
+ ArrayProperty<ArrayProperty<I32Property>>:$nested,
+ OptionalProperty<ArrayProperty<I32Property>>:$opt,
+ ArrayProperty<OptionalProperty<I64Property>>:$explicitOptions,
+ ArrayProperty<UnitProperty>:$explicitUnits,
+ DefaultValuedProperty<ArrayProperty<I64Property>,
+ "::llvm::ArrayRef<int64_t>{}", "::llvm::SmallVector<int64_t>{}">:$hasDefault
+ );
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Test/TestOpsSyntax.td b/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
index 3129085..795b9da 100644
--- a/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
+++ b/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
@@ -86,6 +86,17 @@ def OIListTrivial : TEST_Op<"oilist_with_keywords_only"> {
}];
}
+// Ops related to OIList primitive
+def OIListTrivialProperties : TEST_Op<"oilist_with_keywords_only_properties"> {
+ let arguments = (ins UnitProperty:$keyword, UnitProperty:$otherKeyword,
+ UnitProperty:$diffNameUnitPropertyKeyword);
+ let assemblyFormat = [{
+ oilist( `keyword` $keyword
+ | `otherKeyword` $otherKeyword
+ | `thirdKeyword` $diffNameUnitPropertyKeyword) attr-dict
+ }];
+}
+
def OIListSimple : TEST_Op<"oilist_with_simple_args", [AttrSizedOperandSegments]> {
let arguments = (ins Optional<AnyType>:$arg0,
Optional<AnyType>:$arg1,
@@ -392,6 +403,17 @@ def FormatOptionalUnitAttrNoElide
let assemblyFormat = "($is_optional^)? attr-dict";
}
+def FormatOptionalUnitProperty : TEST_Op<"format_optional_unit_property"> {
+ let arguments = (ins UnitProperty:$is_optional);
+ let assemblyFormat = "(`is_optional` $is_optional^)? attr-dict";
+}
+
+def FormatOptionalUnitPropertyNoElide
+ : TEST_Op<"format_optional_unit_property_no_elide"> {
+ let arguments = (ins UnitProperty:$is_optional);
+ let assemblyFormat = "($is_optional^)? attr-dict";
+}
+
def FormatOptionalEnumAttr : TEST_Op<"format_optional_enum_attr"> {
let arguments = (ins OptionalAttr<SomeI64Enum>:$attr);
let assemblyFormat = "($attr^)? attr-dict";
diff --git a/mlir/test/mlir-tblgen/op-format.mlir b/mlir/test/mlir-tblgen/op-format.mlir
index 46d27264..03288ae 100644
--- a/mlir/test/mlir-tblgen/op-format.mlir
+++ b/mlir/test/mlir-tblgen/op-format.mlir
@@ -195,6 +195,16 @@ test.format_optional_unit_attribute
// CHECK: test.format_optional_unit_attribute_no_elide unit
test.format_optional_unit_attribute_no_elide unit
+// CHECK: test.format_optional_unit_property is_optional
+test.format_optional_unit_property is_optional
+
+// CHECK: test.format_optional_unit_property
+// CHECK-NOT: is_optional
+test.format_optional_unit_property
+
+// CHECK: test.format_optional_unit_property_no_elide unit
+test.format_optional_unit_property_no_elide unit
+
// CHECK: test.format_optional_enum_attr case5
test.format_optional_enum_attr case5
diff --git a/mlir/test/mlir-tblgen/op-format.td b/mlir/test/mlir-tblgen/op-format.td
index 4a19ffb..8af4341 100644
--- a/mlir/test/mlir-tblgen/op-format.td
+++ b/mlir/test/mlir-tblgen/op-format.td
@@ -73,7 +73,7 @@ def OptionalGroupA : TestFormat_Op<[{
// CHECK-NEXT: result.addAttribute("a", parser.getBuilder().getUnitAttr())
// CHECK: parser.parseKeyword("bar")
// CHECK-LABEL: OptionalGroupB::print
-// CHECK: if (!getAAttr())
+// CHECK: if (!(getAAttr() && getAAttr() != ((false) ? ::mlir::OpBuilder((*this)->getContext()).getUnitAttr() : nullptr)))
// CHECK-NEXT: odsPrinter << ' ' << "foo"
// CHECK-NEXT: else
// CHECK-NEXT: odsPrinter << ' ' << "bar"
@@ -84,7 +84,7 @@ def OptionalGroupB : TestFormat_Op<[{
// Optional group anchored on a default-valued attribute:
// CHECK-LABEL: OptionalGroupC::parse
-// CHECK: if (getAAttr() && getAAttr() != ::mlir::OpBuilder((*this)->getContext()).getStringAttr("default")) {
+// CHECK: if (getAAttr() != ::mlir::OpBuilder((*this)->getContext()).getStringAttr("default")) {
// CHECK-NEXT: odsPrinter << ' ';
// CHECK-NEXT: odsPrinter.printAttributeWithoutType(getAAttr());
// CHECK-NEXT: }
diff --git a/mlir/test/mlir-tblgen/op-properties.td b/mlir/test/mlir-tblgen/op-properties.td
index 7b0ee6b..918583c 100644
--- a/mlir/test/mlir-tblgen/op-properties.td
+++ b/mlir/test/mlir-tblgen/op-properties.td
@@ -1,8 +1,10 @@
-// RUN: mlir-tblgen -gen-op-decls -I %S/../../include %s | FileCheck %s
+// RUN: mlir-tblgen -gen-op-decls -I %S/../../include %s | FileCheck %s --check-prefix=DECL
+// RUN: mlir-tblgen -gen-op-defs -I %S/../../include %s | FileCheck %s --check-prefix=DEFS
include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/EnumAttr.td"
include "mlir/IR/OpBase.td"
+include "mlir/IR/Properties.td"
def Test_Dialect : Dialect {
let name = "test";
@@ -15,7 +17,115 @@ def OpWithAttr : NS_Op<"op_with_attr">{
let arguments = (ins AnyAttr:$attr, OptionalAttr<AnyAttr>:$optional);
}
-// CHECK: void setAttrAttr(::mlir::Attribute attr)
-// CHECK-NEXT: getProperties().attr = attr
-// CHECK: void setOptionalAttr(::mlir::Attribute attr)
-// CHECK-NEXT: getProperties().optional = attr
+// Test required and optional properties
+// ---
+
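+// An array property whose interface-type default is an empty ArrayRef but
+// whose storage must be initialized with a SmallVector, exercising
+// storageTypeValueOverride.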
+def DefaultI64Array : IntArrayProperty<"int64_t"> {
+ let defaultValue = "::llvm::ArrayRef<int64_t>{}";
+ let storageTypeValueOverride = "::llvm::SmallVector<int64_t>{}";
+}
+
+def OpWithProps : NS_Op<"op_with_props"> {
+ let arguments = (ins
+ BoolProperty:$flag,
+ StringProperty:$string,
+ ArrayProperty<StringProperty>:$strings,
+ DefaultValuedProperty<I32Property, "0">:$default_int,
+ OptionalProperty<I64Property>:$optional,
+ DefaultI64Array:$intArray
+ );
+}
+
+/// Check that optional arguments to builders only go at the end.
+def OpWithSomeOptionalProperties : NS_Op<"op_with_some_optional_props"> {
+ let arguments = (ins
+ OptionalProperty<I64Property>:$mustSpecify,
+ I64Property:$required,
+ OptionalProperty<StringProperty>:$canOmit,
+ DefaultValuedProperty<I64Property, "-1">:$canOmit2
+ );
+}
+
+/// Check that the protection against ambiguous attributes correctly stops
+/// optional properties from getting default argument values in builders.
+def OpWithOptionalPropsAndAttrs :
+ NS_Op<"with_some_optional_props_and_atts"> {
+ let arguments = (ins
+ OptionalProperty<BoolProperty>:$mustSpecify,
+ OptionalAttr<BoolAttr>:$ambiguous,
+ OptionalAttr<I32Attr>:$canOmit,
+ OptionalProperty<I32Property>:$canOmitProp
+ );
+}
+
+// DECL: void setAttrAttr(::mlir::Attribute attr)
+// DECL-NEXT: getProperties().attr = attr
+// DECL: void setOptionalAttr(::mlir::Attribute attr)
+// DECL-NEXT: getProperties().optional = attr
+
+// -----
+
+// DECL-LABEL: class OpWithOptionalPropsAndAttrs :
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: /*optional*/std::optional<bool> mustSpecify,
+// DECL-SAME: /*optional*/::mlir::BoolAttr ambiguous,
+// DECL-SAME: /*optional*/::mlir::IntegerAttr canOmit,
+// DECL-SAME: /*optional*/std::optional<int32_t> canOmitProp = std::nullopt);
+
+// -----
+
+// COM: Ensure the struct is set up how we expect
+// DECL-LABEL: class OpWithPropsGenericAdaptorBase
+// DECL: using flagTy = bool;
+// DECL-NEXT: flagTy flag;
+// DECL-NEXT: bool getFlag()
+// DECL-NEXT: propStorage = this->flag
+// DECL-NEXT: return propStorage;
+// DECL: void setFlag(bool propValue)
+// DECL-NEXT: propStorage = this->flag;
+// DECL-NEXT: propStorage = propValue;
+// DECL: using stringTy = std::string;
+// DECL: llvm::StringRef getString()
+// DECL: auto &propStorage = this->string;
+// DECL-NEXT: return ::llvm::StringRef{propStorage};
+// DECL: using stringsTy = ::llvm::SmallVector<std::string>
+// DECL: ::llvm::ArrayRef<std::string> getStrings()
+// DECL: using default_intTy = int32_t;
+// DECL: default_intTy default_int = 0;
+// DECL: intArrayTy intArray = ::llvm::SmallVector<int64_t>{};
+// DECL: ::llvm::ArrayRef<int64_t> getIntArray()
+// DECL: return ::llvm::ArrayRef<int64_t>{propStorage}
+// DECL: void setIntArray(::llvm::ArrayRef<int64_t> propValue)
+// DECL: propStorage.assign
+// DECL-LABEL: class OpWithProps :
+// DECL: setString(::llvm::StringRef newString)
+// DECL-NEXT: getProperties().setString(newString)
+
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: bool flag,
+// DECL-SAME: ::llvm::StringRef string,
+// DECL-SAME: ::llvm::ArrayRef<std::string> strings,
+// DECL-SAME: /*optional*/int32_t default_int = 0,
+// DECL-SAME: /*optional*/std::optional<int64_t> optional = std::nullopt,
+// DECL-SAME: /*optional*/::llvm::ArrayRef<int64_t> intArray = ::llvm::ArrayRef<int64_t>{});
+
+// DEFS-LABEL: OpWithProps::computePropertiesHash
+// DEFS: hash_intArray
+// DEFS-NEXT: return ::llvm::hash_value(::llvm::ArrayRef<int64_t>{propStorage})
+// DEFS: ::llvm::hash_value(prop.optional)
+// DEFS: hash_intArray(prop.intArray)
+
+// -----
+
+// DECL-LABEL: class OpWithSomeOptionalProperties :
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: /*optional*/std::optional<int64_t> mustSpecify,
+// DECL-SAME: int64_t required,
+// DECL-SAME: /*optional*/std::optional<::llvm::StringRef> canOmit = std::nullopt,
+// DECL-SAME: /*optional*/int64_t canOmit2 = -1);
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 0fc750c..a2ceefb 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -155,6 +155,36 @@ static const char *const valueRangeReturnCode = R"(
std::next({0}, valueRange.first + valueRange.second)};
)";
+/// Parse operand/result segment_size property.
+/// {0}: Number of elements in the segment array
+static const char *const parseTextualSegmentSizeFormat = R"(
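+  // Parse exactly {0} comma-separated integers enclosed in square brackets.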
+ size_t i = 0;
+ auto parseElem = [&]() -> ::mlir::ParseResult {
+ if (i >= {0})
+ return $_parser.emitError($_parser.getCurrentLocation(),
+ "expected `]` after {0} segment sizes");
+ if (failed($_parser.parseInteger($_storage[i])))
+ return ::mlir::failure();
+ i += 1;
+ return ::mlir::success();
+ };
+  if (failed($_parser.parseCommaSeparatedList(
+      ::mlir::AsmParser::Delimiter::Square, parseElem)))
+    return ::mlir::failure();
+  if (i < {0})
+    return $_parser.emitError($_parser.getCurrentLocation(),
+      "expected {0} segment sizes, found only ") << i;
+  return ::mlir::success();
+)";
+
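+/// Print an operand/result segment_size property as `[a, b, c]`, matching the
+/// textual form accepted by the parser above.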
+static const char *const printTextualSegmentSize = R"(
+ [&]() {
+ $_printer << '[';
+ ::llvm::interleaveComma($_storage, $_printer);
+ $_printer << ']';
+ }()
+)";
+
/// Read operand/result segment_size from bytecode.
static const char *const readBytecodeSegmentSizeNative = R"(
if ($_reader.getBytecodeVersion() >= /*kNativePropertiesODSSegmentSize=*/6)
@@ -422,8 +452,10 @@ private:
// Property
std::optional<NamedProperty> operandSegmentsSize;
std::string operandSegmentsSizeStorage;
+ std::string operandSegmentsSizeParser;
std::optional<NamedProperty> resultSegmentsSize;
std::string resultSegmentsSizeStorage;
+ std::string resultSegmentsSizeParser;
// Indices to store the position in the emission order of the operand/result
// segment sizes attribute if emitted as part of the properties for legacy
@@ -448,31 +480,40 @@ void OpOrAdaptorHelper::computeAttrMetadata() {
{namedAttr.name, AttributeMetadata{namedAttr.name, !isOptional, attr}});
}
- auto makeProperty = [&](StringRef storageType) {
+ auto makeProperty = [&](StringRef storageType, StringRef parserCall) {
return Property(
+ /*summary=*/"",
+ /*description=*/"",
/*storageType=*/storageType,
/*interfaceType=*/"::llvm::ArrayRef<int32_t>",
/*convertFromStorageCall=*/"$_storage",
/*assignToStorageCall=*/
"::llvm::copy($_value, $_storage.begin())",
/*convertToAttributeCall=*/
- "::mlir::DenseI32ArrayAttr::get($_ctxt, $_storage)",
+ "return ::mlir::DenseI32ArrayAttr::get($_ctxt, $_storage);",
/*convertFromAttributeCall=*/
"return convertFromAttribute($_storage, $_attr, $_diag);",
+ /*parserCall=*/parserCall,
+ /*optionalParserCall=*/"",
+ /*printerCall=*/printTextualSegmentSize,
/*readFromMlirBytecodeCall=*/readBytecodeSegmentSizeNative,
/*writeToMlirBytecodeCall=*/writeBytecodeSegmentSizeNative,
/*hashPropertyCall=*/
"::llvm::hash_combine_range(std::begin($_storage), "
"std::end($_storage));",
- /*StringRef defaultValue=*/"");
+ /*StringRef defaultValue=*/"",
+ /*storageTypeValueOverride=*/"");
};
// Include key attributes from several traits as implicitly registered.
if (op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) {
if (op.getDialect().usePropertiesForAttributes()) {
operandSegmentsSizeStorage =
llvm::formatv("std::array<int32_t, {0}>", op.getNumOperands());
- operandSegmentsSize = {"operandSegmentSizes",
- makeProperty(operandSegmentsSizeStorage)};
+ operandSegmentsSizeParser =
+ llvm::formatv(parseTextualSegmentSizeFormat, op.getNumOperands());
+ operandSegmentsSize = {
+ "operandSegmentSizes",
+ makeProperty(operandSegmentsSizeStorage, operandSegmentsSizeParser)};
} else {
attrMetadata.insert(
{operandSegmentAttrName, AttributeMetadata{operandSegmentAttrName,
@@ -484,8 +525,11 @@ void OpOrAdaptorHelper::computeAttrMetadata() {
if (op.getDialect().usePropertiesForAttributes()) {
resultSegmentsSizeStorage =
llvm::formatv("std::array<int32_t, {0}>", op.getNumResults());
- resultSegmentsSize = {"resultSegmentSizes",
- makeProperty(resultSegmentsSizeStorage)};
+ resultSegmentsSizeParser =
+ llvm::formatv(parseTextualSegmentSizeFormat, op.getNumResults());
+ resultSegmentsSize = {
+ "resultSegmentSizes",
+ makeProperty(resultSegmentsSizeStorage, resultSegmentsSizeParser)};
} else {
attrMetadata.insert(
{resultSegmentAttrName,
@@ -572,6 +616,12 @@ private:
void
genPropertiesSupportForBytecode(ArrayRef<ConstArgument> attrOrProperties);
+ // Generates getters for the properties.
+ void genPropGetters();
+
+  // Generates setters for the properties.
+ void genPropSetters();
+
// Generates getters for the attributes.
void genAttrGetters();
@@ -1041,6 +1091,8 @@ OpEmitter::OpEmitter(const Operator &op,
genNamedRegionGetters();
genNamedSuccessorGetters();
genPropertiesSupport();
+ genPropGetters();
+ genPropSetters();
genAttrGetters();
genAttrSetters();
genOptionalAttrRemovers();
@@ -1198,6 +1250,16 @@ void OpEmitter::genAttrNameGetters() {
}
}
+// Emit the getter for a named property.
+// It is templated to be shared between the Op and the adaptor class.
+template <typename OpClassOrAdaptor>
+static void emitPropGetter(OpClassOrAdaptor &opClass, const Operator &op,
+ StringRef name, const Property &prop) {
+ auto *method = opClass.addInlineMethod(prop.getInterfaceType(), name);
+ ERROR_IF_PRUNED(method, name, op);
+ method->body() << formatv(" return getProperties().{0}();", name);
+}
+
// Emit the getter for an attribute with the return type specified.
// It is templated to be shared between the Op and the adaptor class.
template <typename OpClassOrAdaptor>
@@ -1313,7 +1375,7 @@ void OpEmitter::genPropertiesSupport() {
)decl";
const char *propFromAttrFmt = R"decl(
auto setFromAttr = [] (auto &propStorage, ::mlir::Attribute propAttr,
- ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) {{
+ ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) -> ::mlir::LogicalResult {{
{0}
};
{2};
@@ -1358,7 +1420,10 @@ void OpEmitter::genPropertiesSupport() {
.addSubst("_storage", propertyStorage)
.addSubst("_diag", propertyDiag)),
name, getAttr);
- if (prop.hasDefaultValue()) {
+ if (prop.hasStorageTypeValueOverride()) {
+ setPropMethod << formatv(attrGetDefaultFmt, name,
+ prop.getStorageTypeValueOverride());
+ } else if (prop.hasDefaultValue()) {
setPropMethod << formatv(attrGetDefaultFmt, name,
prop.getDefaultValue());
} else {
@@ -1409,8 +1474,10 @@ void OpEmitter::genPropertiesSupport() {
const char *propToAttrFmt = R"decl(
{
const auto &propStorage = prop.{0};
- attrs.push_back(odsBuilder.getNamedAttr("{0}",
- {1}));
+ auto attr = [&]() -> ::mlir::Attribute {{
+ {1}
+ }();
+ attrs.push_back(odsBuilder.getNamedAttr("{0}", attr));
}
)decl";
for (const auto &attrOrProp : attrOrProperties) {
@@ -1458,9 +1525,12 @@ void OpEmitter::genPropertiesSupport() {
StringRef name = namedProperty->name;
auto &prop = namedProperty->prop;
FmtContext fctx;
- hashMethod << formatv(propHashFmt, name,
- tgfmt(prop.getHashPropertyCall(),
- &fctx.addSubst("_storage", propertyStorage)));
+ if (!prop.getHashPropertyCall().empty()) {
+ hashMethod << formatv(
+ propHashFmt, name,
+ tgfmt(prop.getHashPropertyCall(),
+ &fctx.addSubst("_storage", propertyStorage)));
+ }
}
}
hashMethod << " return llvm::hash_combine(";
@@ -1468,8 +1538,13 @@ void OpEmitter::genPropertiesSupport() {
attrOrProperties, hashMethod, [&](const ConstArgument &attrOrProp) {
if (const auto *namedProperty =
llvm::dyn_cast_if_present<const NamedProperty *>(attrOrProp)) {
- hashMethod << "\n hash_" << namedProperty->name << "(prop."
- << namedProperty->name << ")";
+ if (!namedProperty->prop.getHashPropertyCall().empty()) {
+ hashMethod << "\n hash_" << namedProperty->name << "(prop."
+ << namedProperty->name << ")";
+ } else {
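+            // Properties without a custom hash function fall back to
+            // llvm::hash_value on their storage.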
+ hashMethod << "\n ::llvm::hash_value(prop."
+ << namedProperty->name << ")";
+ }
return;
}
const auto *namedAttr =
@@ -1524,8 +1599,9 @@ void OpEmitter::genPropertiesSupport() {
"\"{0}\") return ",
resultSegmentAttrName);
}
- getInherentAttrMethod << tgfmt(prop.getConvertToAttributeCall(), &fctx)
- << ";\n";
+ getInherentAttrMethod << "[&]() -> ::mlir::Attribute { "
+ << tgfmt(prop.getConvertToAttributeCall(), &fctx)
+ << " }();\n";
if (name == operandSegmentAttrName) {
setInherentAttrMethod
@@ -1549,13 +1625,15 @@ void OpEmitter::genPropertiesSupport() {
)decl",
name);
if (name == operandSegmentAttrName) {
- populateInherentAttrsMethod
- << formatv(" attrs.append(\"{0}\", {1});\n", operandSegmentAttrName,
- tgfmt(prop.getConvertToAttributeCall(), &fctx));
+ populateInherentAttrsMethod << formatv(
+ " attrs.append(\"{0}\", [&]() -> ::mlir::Attribute { {1} }());\n",
+ operandSegmentAttrName,
+ tgfmt(prop.getConvertToAttributeCall(), &fctx));
} else {
- populateInherentAttrsMethod
- << formatv(" attrs.append(\"{0}\", {1});\n", resultSegmentAttrName,
- tgfmt(prop.getConvertToAttributeCall(), &fctx));
+ populateInherentAttrsMethod << formatv(
+ " attrs.append(\"{0}\", [&]() -> ::mlir::Attribute { {1} }());\n",
+ resultSegmentAttrName,
+ tgfmt(prop.getConvertToAttributeCall(), &fctx));
}
}
getInherentAttrMethod << " return std::nullopt;\n";
@@ -1701,6 +1779,26 @@ void OpEmitter::genPropertiesSupportForBytecode(
readPropertiesMethod << " return ::mlir::success();";
}
+void OpEmitter::genPropGetters() {
+ for (const NamedProperty &prop : op.getProperties()) {
+ std::string name = op.getGetterName(prop.name);
+ emitPropGetter(opClass, op, name, prop.prop);
+ }
+}
+
+void OpEmitter::genPropSetters() {
+ for (const NamedProperty &prop : op.getProperties()) {
+ std::string name = op.getSetterName(prop.name);
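+    // Name the setter argument after the property, e.g. `value` -> `newValue`.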
+ std::string argName = "new" + convertToCamelFromSnakeCase(
+ prop.name, /*capitalizeFirst=*/true);
+ auto *method = opClass.addInlineMethod(
+ "void", name, MethodParameter(prop.prop.getInterfaceType(), argName));
+ if (!method)
+ return;
+ method->body() << formatv(" getProperties().{0}({1});", name, argName);
+ }
+}
+
void OpEmitter::genAttrGetters() {
FmtContext fctx;
fctx.withBuilder("::mlir::Builder((*this)->getContext())");
@@ -2957,6 +3055,12 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
}
// Add parameters for all arguments (operands and attributes).
+ // Track "attr-like" (property and attribute) optional values separate from
+ // attributes themselves so that the disambiguation code can look at the first
+ // attribute specifically when determining where to trim the optional-value
+ // list to avoid ambiguity while preserving the ability of all-property ops to
+ // use default parameters.
+ int defaultValuedAttrLikeStartIndex = op.getNumArgs();
int defaultValuedAttrStartIndex = op.getNumArgs();
// Successors and variadic regions go at the end of the parameter list, so no
// default arguments are possible.
@@ -2967,6 +3071,15 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
for (int i = op.getNumArgs() - 1; i >= 0; --i) {
auto *namedAttr =
llvm::dyn_cast_if_present<tblgen::NamedAttribute *>(op.getArg(i));
+ auto *namedProperty =
+ llvm::dyn_cast_if_present<tblgen::NamedProperty *>(op.getArg(i));
+ if (namedProperty) {
+ Property prop = namedProperty->prop;
+ if (!prop.hasDefaultValue())
+ break;
+ defaultValuedAttrLikeStartIndex = i;
+ continue;
+ }
if (!namedAttr)
break;
@@ -2986,6 +3099,7 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
if (retType == "::llvm::APInt" || retType == "::llvm::APFloat")
break;
+ defaultValuedAttrLikeStartIndex = i;
defaultValuedAttrStartIndex = i;
}
}
@@ -3001,8 +3115,10 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
if ((attrParamKind == AttrParamKind::WrappedAttr &&
canUseUnwrappedRawValue(attr)) ||
(attrParamKind == AttrParamKind::UnwrappedValue &&
- !canUseUnwrappedRawValue(attr)))
+ !canUseUnwrappedRawValue(attr))) {
++defaultValuedAttrStartIndex;
+ defaultValuedAttrLikeStartIndex = defaultValuedAttrStartIndex;
+ }
}
/// Collect any inferred attributes.
@@ -3029,8 +3145,16 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
operand->isOptional());
continue;
}
- if (llvm::isa_and_present<NamedProperty *>(arg)) {
- // TODO
+ if (auto *propArg = llvm::dyn_cast_if_present<NamedProperty *>(arg)) {
+ const Property &prop = propArg->prop;
+ StringRef type = prop.getInterfaceType();
+ std::string defaultValue;
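+      // Attach a default argument value only when every later argument can
+      // also be defaulted; otherwise the default would be unusable.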
+ if (prop.hasDefaultValue() && i >= defaultValuedAttrLikeStartIndex) {
+ defaultValue = prop.getDefaultValue();
+ }
+ bool isOptional = prop.hasDefaultValue();
+ paramList.emplace_back(type, propArg->name, StringRef(defaultValue),
+ isOptional);
continue;
}
const NamedAttribute &namedAttr = *arg.get<NamedAttribute *>();
@@ -3157,6 +3281,15 @@ void OpEmitter::genCodeForAddingArgAndRegionForBuilder(
}
}
+ // Push all properties to the result.
+ for (const auto &namedProp : op.getProperties()) {
+ // Use the setter from the Properties struct since the conversion from the
+ // interface type (used in the builder argument) to the storage type (used
+ // in the state) is not necessarily trivial.
+ std::string setterName = op.getSetterName(namedProp.name);
+ body << formatv(" {0}.getOrAddProperties<Properties>().{1}({2});\n",
+ builderOpState, setterName, namedProp.name);
+ }
// Push all attributes to the result.
for (const auto &namedAttr : op.getAttributes()) {
auto &attr = namedAttr.attr;
@@ -3996,17 +4129,19 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
// Generate the data member using the storage type.
os << " using " << name << "Ty = " << prop.getStorageType() << ";\n"
<< " " << name << "Ty " << name;
- if (prop.hasDefaultValue())
+ if (prop.hasStorageTypeValueOverride())
+ os << " = " << prop.getStorageTypeValueOverride();
+ else if (prop.hasDefaultValue())
os << " = " << prop.getDefaultValue();
comparatorOs << " rhs." << name << " == this->" << name
<< " &&\n";
// Emit accessors using the interface type.
const char *accessorFmt = R"decl(;
- {0} get{1}() {
+ {0} get{1}() const {
auto &propStorage = this->{2};
return {3};
}
- void set{1}(const {0} &propValue) {
+ void set{1}({0} propValue) {
auto &propStorage = this->{2};
{4};
}
@@ -4274,6 +4409,11 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
ERROR_IF_PRUNED(m, "Adaptor::getAttributes", op);
m->body() << " return odsAttrs;";
}
+ for (auto &namedProp : op.getProperties()) {
+ std::string name = op.getGetterName(namedProp.name);
+ emitPropGetter(genericAdaptorBase, op, name, namedProp.prop);
+ }
+
for (auto &namedAttr : op.getAttributes()) {
const auto &name = namedAttr.name;
const auto &attr = namedAttr.attr;
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index a97d876..27ad79a 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -45,7 +45,7 @@ public:
OpVariableElement(const VarT *var) : var(var) {}
/// Get the variable.
- const VarT *getVar() { return var; }
+ const VarT *getVar() const { return var; }
protected:
/// The op variable, e.g. a type or attribute constraint.
@@ -64,11 +64,6 @@ struct AttributeVariable
return attrType ? attrType->getBuilderCall() : std::nullopt;
}
- /// Return if this attribute refers to a UnitAttr.
- bool isUnitAttr() const {
- return var->attr.getBaseAttr().getAttrDefName() == "UnitAttr";
- }
-
/// Indicate if this attribute is printed "qualified" (that is it is
/// prefixed with the `#dialect.mnemonic`).
bool shouldBeQualified() { return shouldBeQualifiedFlag; }
@@ -98,6 +93,42 @@ using SuccessorVariable =
/// This class represents a variable that refers to a property argument.
using PropertyVariable =
OpVariableElement<NamedProperty, VariableElement::Property>;
+
+/// LLVM RTTI helper for attribute-like variables, that is, attributes or
+/// properties. This allows for common handling of attributes and properties in
+/// parts of the code that are oblivious to whether something is stored as an
+/// attribute or a property.
+struct AttributeLikeVariable : public VariableElement {
+ enum { AttributeLike = 1 << 0 };
+
+ static bool classof(const VariableElement *ve) {
+ return ve->getKind() == VariableElement::Attribute ||
+ ve->getKind() == VariableElement::Property;
+ }
+
+ static bool classof(const FormatElement *fe) {
+ return isa<VariableElement>(fe) && classof(cast<VariableElement>(fe));
+ }
+
+ /// Returns true if the variable is a UnitAttr or a UnitProperty.
+ bool isUnit() const {
+ if (const auto *attr = dyn_cast<AttributeVariable>(this))
+ return attr->getVar()->attr.getBaseAttr().getAttrDefName() == "UnitAttr";
+ if (const auto *prop = dyn_cast<PropertyVariable>(this)) {
+ return prop->getVar()->prop.getBaseProperty().getPropertyDefName() ==
+ "UnitProperty";
+ }
+ llvm_unreachable("Type that wasn't listed in classof()");
+ }
+
+ StringRef getName() const {
+ if (const auto *attr = dyn_cast<AttributeVariable>(this))
+ return attr->getVar()->name;
+ if (const auto *prop = dyn_cast<PropertyVariable>(this))
+ return prop->getVar()->name;
+ llvm_unreachable("Type that wasn't listed in classof()");
+ }
+};
} // namespace
//===----------------------------------------------------------------------===//
@@ -214,11 +245,11 @@ public:
/// If the parsing element is a single UnitAttr element, then it returns the
/// attribute variable. Otherwise, returns nullptr.
- AttributeVariable *
- getUnitAttrParsingElement(ArrayRef<FormatElement *> pelement) {
+ AttributeLikeVariable *
+ getUnitVariableParsingElement(ArrayRef<FormatElement *> pelement) {
if (pelement.size() == 1) {
- auto *attrElem = dyn_cast<AttributeVariable>(pelement[0]);
- if (attrElem && attrElem->isUnitAttr())
+ auto *attrElem = dyn_cast<AttributeLikeVariable>(pelement[0]);
+ if (attrElem && attrElem->isUnit())
return attrElem;
}
return nullptr;
@@ -488,6 +519,36 @@ const char *const enumAttrParserCode = R"(
}
)";
+/// The code snippet used to generate a parser call for a property.
+/// {0}: The name of the property
+/// {1}: The C++ class name of the operation
+/// {2}: The property's parser code with appropriate substitutions performed
+/// {3}: The description of the expected property for the error message.
+const char *const propertyParserCode = R"(
+ auto {0}PropLoc = parser.getCurrentLocation();
+ auto {0}PropParseResult = [&](auto& propStorage) -> ::mlir::ParseResult {{
+ {2}
+ return ::mlir::success();
+ }(result.getOrAddProperties<{1}::Properties>().{0});
+ if (failed({0}PropParseResult)) {{
+ return parser.emitError({0}PropLoc, "invalid value for property {0}, expected {3}");
+ }
+)";
+
+/// The code snippet used to generate an optional parser call for a property.
+/// {0}: The name of the property
+/// {1}: The C++ class name of the operation
+/// {2}: The property's parser code with appropriate substitutions performed
+const char *const optionalPropertyParserCode = R"(
+ auto {0}PropParseResult = [&](auto& propStorage) -> ::mlir::OptionalParseResult {{
+ {2}
+ return ::mlir::success();
+ }(result.getOrAddProperties<{1}::Properties>().{0});
+ if ({0}PropParseResult.has_value() && failed(*{0}PropParseResult)) {{
+ return ::mlir::failure();
+ }
+)";
+
/// The code snippet used to generate a parser call for an operand.
///
/// {0}: The name of the operand.
@@ -796,9 +857,9 @@ static void genElementParserStorage(FormatElement *element, const Operator &op,
// If the anchor is a unit attribute, it won't be parsed directly so elide
// it.
- auto *anchor = dyn_cast<AttributeVariable>(optional->getAnchor());
+ auto *anchor = dyn_cast<AttributeLikeVariable>(optional->getAnchor());
FormatElement *elidedAnchorElement = nullptr;
- if (anchor && anchor != elements.front() && anchor->isUnitAttr())
+ if (anchor && anchor != elements.front() && anchor->isUnit())
elidedAnchorElement = anchor;
for (FormatElement *childElement : elements)
if (childElement != elidedAnchorElement)
@@ -808,7 +869,7 @@ static void genElementParserStorage(FormatElement *element, const Operator &op,
} else if (auto *oilist = dyn_cast<OIListElement>(element)) {
for (ArrayRef<FormatElement *> pelement : oilist->getParsingElements()) {
- if (!oilist->getUnitAttrParsingElement(pelement))
+ if (!oilist->getUnitVariableParsingElement(pelement))
for (FormatElement *element : pelement)
genElementParserStorage(element, op, body);
}
@@ -1049,7 +1110,6 @@ static void genCustomDirectiveParser(CustomDirective *dir, MethodBody &body,
body << llvm::formatv(" result.addAttribute(\"{0}\", {0}Attr);\n",
var->name);
}
-
} else if (auto *operand = dyn_cast<OperandVariable>(param)) {
const NamedTypeConstraint *var = operand->getVar();
if (var->isOptional()) {
@@ -1137,6 +1197,29 @@ static void genEnumAttrParser(const NamedAttribute *var, MethodBody &body,
validCaseKeywordsStr, errorMessage, attrAssignment);
}
+// Generate the parser for a property.
+static void genPropertyParser(PropertyVariable *propVar, MethodBody &body,
+ StringRef opCppClassName,
+ bool requireParse = true) {
+ StringRef name = propVar->getVar()->name;
+ const Property &prop = propVar->getVar()->prop;
+ bool parseOptionally =
+ prop.hasDefaultValue() && !requireParse && prop.hasOptionalParser();
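+  // Bind the $_parser/$_ctxt/$_storage placeholders from the TableGen property
+  // definition to the names available inside the generated parse method.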
+ FmtContext fmtContext;
+ fmtContext.addSubst("_parser", "parser");
+ fmtContext.addSubst("_ctxt", "parser.getContext()");
+ fmtContext.addSubst("_storage", "propStorage");
+
+ if (parseOptionally) {
+ body << formatv(optionalPropertyParserCode, name, opCppClassName,
+ tgfmt(prop.getOptionalParserCall(), &fmtContext));
+ } else {
+ body << formatv(propertyParserCode, name, opCppClassName,
+ tgfmt(prop.getParserCall(), &fmtContext),
+ prop.getSummary());
+ }
+}
+
// Generate the parser for an attribute.
static void genAttrParser(AttributeVariable *attr, MethodBody &body,
FmtContext &attrTypeCtx, bool parseAsOptional,
@@ -1213,14 +1296,16 @@ if (!dict) {
}
)decl";
- // TODO: properties might be optional as well.
+ // {0}: fromAttribute call
+ // {1}: property name
+ // {2}: isRequired
const char *propFromAttrFmt = R"decl(
auto setFromAttr = [] (auto &propStorage, ::mlir::Attribute propAttr,
- ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) {{
+ ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) -> ::mlir::LogicalResult {{
{0};
};
auto attr = dict.get("{1}");
-if (!attr) {{
+if (!attr && {2}) {{
emitError() << "expected key entry for {1} in DictionaryAttr to set "
"Properties.";
return ::mlir::failure();
@@ -1238,13 +1323,14 @@ if (::mlir::failed(setFromAttr(prop.{1}, attr, emitError)))
StringRef name = namedProperty.name;
const Property &prop = namedProperty.prop;
+ bool isRequired = !prop.hasDefaultValue();
FmtContext fctx;
body << formatv(propFromAttrFmt,
tgfmt(prop.getConvertFromAttributeCall(),
&fctx.addSubst("_attr", "propAttr")
.addSubst("_storage", "propStorage")
.addSubst("_diag", "emitError")),
- name);
+ name, isRequired);
}
// Generate the setter for any attribute not parsed elsewhere.
@@ -1331,20 +1417,24 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
// If the anchor is a unit attribute, we don't need to print it. When
// parsing, we will add this attribute if this group is present.
FormatElement *elidedAnchorElement = nullptr;
- auto *anchorAttr = dyn_cast<AttributeVariable>(optional->getAnchor());
- if (anchorAttr && anchorAttr != firstElement &&
- anchorAttr->isUnitAttr()) {
- elidedAnchorElement = anchorAttr;
+ auto *anchorVar = dyn_cast<AttributeLikeVariable>(optional->getAnchor());
+ if (anchorVar && anchorVar != firstElement && anchorVar->isUnit()) {
+ elidedAnchorElement = anchorVar;
if (!thenGroup == optional->isInverted()) {
- // Add the anchor unit attribute to the operation state.
- if (useProperties) {
+          // Add the anchor unit attribute to the operation state, or set the
+          // anchor unit property to true.
+ if (isa<PropertyVariable>(anchorVar)) {
+ body << formatv(
+ " result.getOrAddProperties<{1}::Properties>().{0} = true;",
+ anchorVar->getName(), opCppClassName);
+ } else if (useProperties) {
body << formatv(
" result.getOrAddProperties<{1}::Properties>().{0} = "
"parser.getBuilder().getUnitAttr();",
- anchorAttr->getVar()->name, opCppClassName);
+ anchorVar->getName(), opCppClassName);
} else {
- body << " result.addAttribute(\"" << anchorAttr->getVar()->name
+ body << " result.addAttribute(\"" << anchorVar->getName()
<< "\", parser.getBuilder().getUnitAttr());\n";
}
}
@@ -1368,6 +1458,12 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
genAttrParser(attrVar, body, attrTypeCtx, /*parseAsOptional=*/true,
useProperties, opCppClassName);
body << " if (" << attrVar->getVar()->name << "Attr) {\n";
+ } else if (auto *propVar = dyn_cast<PropertyVariable>(firstElement)) {
+ genPropertyParser(propVar, body, opCppClassName, /*requireParse=*/false);
+ body << llvm::formatv("if ({0}PropParseResult.has_value() && "
+ "succeeded(*{0}PropParseResult)) ",
+ propVar->getVar()->name)
+ << " {\n";
} else if (auto *literal = dyn_cast<LiteralElement>(firstElement)) {
body << " if (::mlir::succeeded(parser.parseOptional";
genLiteralParser(literal->getSpelling(), body);
@@ -1430,15 +1526,19 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
body << ")) {\n";
StringRef lelementName = lelement->getSpelling();
body << formatv(oilistParserCode, lelementName);
- if (AttributeVariable *unitAttrElem =
- oilist->getUnitAttrParsingElement(pelement)) {
- if (useProperties) {
+ if (AttributeLikeVariable *unitVarElem =
+ oilist->getUnitVariableParsingElement(pelement)) {
+ if (isa<PropertyVariable>(unitVarElem)) {
+ body << formatv(
+ " result.getOrAddProperties<{1}::Properties>().{0} = true;",
+ unitVarElem->getName(), opCppClassName);
+ } else if (useProperties) {
body << formatv(
" result.getOrAddProperties<{1}::Properties>().{0} = "
"parser.getBuilder().getUnitAttr();",
- unitAttrElem->getVar()->name, opCppClassName);
+ unitVarElem->getName(), opCppClassName);
} else {
- body << " result.addAttribute(\"" << unitAttrElem->getVar()->name
+ body << " result.addAttribute(\"" << unitVarElem->getName()
<< "\", UnitAttr::get(parser.getContext()));\n";
}
} else {
@@ -1468,6 +1568,8 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
(genCtx == GenContext::Normal && attr->getVar()->attr.isOptional());
genAttrParser(attr, body, attrTypeCtx, parseAsOptional, useProperties,
opCppClassName);
+ } else if (auto *prop = dyn_cast<PropertyVariable>(element)) {
+ genPropertyParser(prop, body, opCppClassName);
} else if (auto *operand = dyn_cast<OperandVariable>(element)) {
ArgumentLengthKind lengthKind = getArgumentLengthKind(operand->getVar());
@@ -1876,6 +1978,38 @@ const char *enumAttrBeginPrinterCode = R"(
auto caseValueStr = {1}(caseValue);
)";
+/// Generate a check that an optional or default-valued attribute or property
+/// has a non-default value. For these purposes, the default state of an
+/// optional attribute is its absence, so presence is what is checked, even if
+/// the attribute itself also declares a default value.
+static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
+ AttributeVariable &attrElement) {
+ Attribute attr = attrElement.getVar()->attr;
+ std::string getter = op.getGetterName(attrElement.getVar()->name);
+ bool optionalAndDefault = attr.isOptional() && attr.hasDefaultValue();
+ if (optionalAndDefault)
+ body << "(";
+ if (attr.isOptional())
+ body << getter << "Attr()";
+ if (optionalAndDefault)
+ body << " && ";
+ if (attr.hasDefaultValue()) {
+ FmtContext fctx;
+ fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())");
+ body << getter << "Attr() != "
+ << tgfmt(attr.getConstBuilderTemplate(), &fctx,
+ attr.getDefaultValue());
+ }
+ if (optionalAndDefault)
+ body << ")";
+}
+
+static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
+ PropertyVariable &propElement) {
+ body << op.getGetterName(propElement.getVar()->name)
+ << "() != " << propElement.getVar()->prop.getDefaultValue();
+}
+
/// Generate the printer for the 'prop-dict' directive.
static void genPropDictPrinter(OperationFormat &fmt, Operator &op,
MethodBody &body) {
@@ -1904,6 +2038,15 @@ static void genPropDictPrinter(OperationFormat &fmt, Operator &op,
body << " }\n";
}
}
+ // Similarly, elide default-valued properties.
+ for (const NamedProperty &prop : op.getProperties()) {
+ if (prop.prop.hasDefaultValue()) {
+ body << " if (" << op.getGetterName(prop.name)
+ << "() == " << prop.prop.getDefaultValue() << ") {";
+ body << " elidedProps.push_back(\"" << prop.name << "\");\n";
+ body << " }\n";
+ }
+ }
body << " _odsPrinter << \" \";\n"
<< " printProperties(this->getContext(), _odsPrinter, "
@@ -2031,7 +2174,6 @@ static void genCustomDirectiveParameterPrinter(FormatElement *element,
} else if (auto *property = dyn_cast<PropertyVariable>(element)) {
FmtContext ctx;
- ctx.addSubst("_ctxt", "getContext()");
const NamedProperty *namedProperty = property->getVar();
ctx.addSubst("_storage", "getProperties()." + namedProperty->name);
body << tgfmt(namedProperty->prop.getConvertFromStorageCall(), &ctx);
@@ -2154,16 +2296,6 @@ static void genEnumAttrPrinter(const NamedAttribute *var, const Operator &op,
" }\n";
}
-/// Generate a check that a DefaultValuedAttr has a value that is non-default.
-static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
- AttributeVariable &attrElement) {
- FmtContext fctx;
- Attribute attr = attrElement.getVar()->attr;
- fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())");
- body << " && " << op.getGetterName(attrElement.getVar()->name) << "Attr() != "
- << tgfmt(attr.getConstBuilderTemplate(), &fctx, attr.getDefaultValue());
-}
-
/// Generate the check for the anchor of an optional group.
static void genOptionalGroupPrinterAnchor(FormatElement *anchor,
const Operator &op,
@@ -2190,17 +2322,12 @@ static void genOptionalGroupPrinterAnchor(FormatElement *anchor,
genOptionalGroupPrinterAnchor(element->getInputs(), op, body);
})
.Case([&](AttributeVariable *element) {
- Attribute attr = element->getVar()->attr;
- body << op.getGetterName(element->getVar()->name) << "Attr()";
- if (attr.isOptional())
- return; // done
- if (attr.hasDefaultValue()) {
- // Consider a default-valued attribute as present if it's not the
- // default value.
- genNonDefaultValueCheck(body, op, *element);
- return;
- }
- llvm_unreachable("attribute must be optional or default-valued");
+        // Consider a default-valued attribute present if it is not the
+        // default value, and an optional one present if it is set.
+ genNonDefaultValueCheck(body, op, *element);
+ })
+ .Case([&](PropertyVariable *element) {
+ genNonDefaultValueCheck(body, op, *element);
})
.Case([&](CustomDirective *ele) {
body << '(';
@@ -2276,10 +2403,10 @@ void OperationFormat::genElementPrinter(FormatElement *element,
ArrayRef<FormatElement *> thenElements = optional->getThenElements();
ArrayRef<FormatElement *> elseElements = optional->getElseElements();
FormatElement *elidedAnchorElement = nullptr;
- auto *anchorAttr = dyn_cast<AttributeVariable>(anchor);
+ auto *anchorAttr = dyn_cast<AttributeLikeVariable>(anchor);
if (anchorAttr && anchorAttr != thenElements.front() &&
(elseElements.empty() || anchorAttr != elseElements.front()) &&
- anchorAttr->isUnitAttr()) {
+ anchorAttr->isUnit()) {
elidedAnchorElement = anchorAttr;
}
auto genElementPrinters = [&](ArrayRef<FormatElement *> elements) {
@@ -2319,13 +2446,13 @@ void OperationFormat::genElementPrinter(FormatElement *element,
for (VariableElement *var : vars) {
TypeSwitch<FormatElement *>(var)
.Case([&](AttributeVariable *attrEle) {
- body << " || (" << op.getGetterName(attrEle->getVar()->name)
- << "Attr()";
- Attribute attr = attrEle->getVar()->attr;
- if (attr.hasDefaultValue()) {
- // Don't print default-valued attributes.
- genNonDefaultValueCheck(body, op, *attrEle);
- }
+ body << " || (";
+ genNonDefaultValueCheck(body, op, *attrEle);
+ body << ")";
+ })
+ .Case([&](PropertyVariable *propEle) {
+ body << " || (";
+ genNonDefaultValueCheck(body, op, *propEle);
body << ")";
})
.Case([&](OperandVariable *ele) {
@@ -2352,7 +2479,7 @@ void OperationFormat::genElementPrinter(FormatElement *element,
body << ") {\n";
genLiteralPrinter(lelement->getSpelling(), body, shouldEmitSpace,
lastWasPunctuation);
- if (oilist->getUnitAttrParsingElement(pelement) == nullptr) {
+ if (oilist->getUnitVariableParsingElement(pelement) == nullptr) {
for (FormatElement *element : pelement)
genElementPrinter(element, body, op, shouldEmitSpace,
lastWasPunctuation);
@@ -2369,7 +2496,7 @@ void OperationFormat::genElementPrinter(FormatElement *element,
return;
}
- // Emit the attribute dictionary.
+ // Emit the property dictionary.
if (isa<PropDictDirective>(element)) {
genPropDictPrinter(*this, op, body);
lastWasPunctuation = false;
@@ -2408,6 +2535,13 @@ void OperationFormat::genElementPrinter(FormatElement *element,
else
body << "_odsPrinter.printStrippedAttrOrType("
<< op.getGetterName(var->name) << "Attr());\n";
+ } else if (auto *property = dyn_cast<PropertyVariable>(element)) {
+ const NamedProperty *var = property->getVar();
+ FmtContext fmtContext;
+ fmtContext.addSubst("_printer", "_odsPrinter");
+ fmtContext.addSubst("_ctxt", "getContext()");
+ fmtContext.addSubst("_storage", "getProperties()." + var->name);
+ body << tgfmt(var->prop.getPrinterCall(), &fmtContext) << ";\n";
} else if (auto *operand = dyn_cast<OperandVariable>(element)) {
if (operand->getVar()->isVariadicOfVariadic()) {
body << " ::llvm::interleaveComma("
@@ -2737,6 +2871,10 @@ static bool isOptionallyParsed(FormatElement *el) {
Attribute attr = attrVar->getVar()->attr;
return attr.isOptional() || attr.hasDefaultValue();
}
+ if (auto *propVar = dyn_cast<PropertyVariable>(el)) {
+ const Property &prop = propVar->getVar()->prop;
+ return prop.hasDefaultValue() && prop.hasOptionalParser();
+ }
if (auto *operandVar = dyn_cast<OperandVariable>(el)) {
const NamedTypeConstraint *operand = operandVar->getVar();
return operand->isOptional() || operand->isVariadic() ||
@@ -3141,10 +3279,9 @@ OpFormatParser::parseVariableImpl(SMLoc loc, StringRef name, Context ctx) {
}
if (const NamedProperty *property = findArg(op.getProperties(), name)) {
- if (ctx != CustomDirectiveContext && ctx != RefDirectiveContext)
+ if (ctx == TypeDirectiveContext)
return emitError(
- loc, "properties currently only supported in `custom` directive");
-
+ loc, "properties cannot be used as children to a `type` directive");
if (ctx == RefDirectiveContext) {
if (!seenProperties.count(property))
return emitError(loc, "property '" + name +
@@ -3428,6 +3565,15 @@ LogicalResult OpFormatParser::verifyOIListParsingElement(FormatElement *element,
"an oilist parsing group");
return success();
})
+ // Only default-valued properties can be within an oilist parsing group.
+ .Case([&](PropertyVariable *propEle) {
+ if (!propEle->getVar()->prop.hasDefaultValue())
+ return emitError(
+ loc,
+ "only default-valued or optional properties can be used in "
+ "an olist parsing group");
+ return success();
+ })
// Only optional-like(i.e. variadic) operands can be within an
// oilist parsing group.
.Case([&](OperandVariable *ele) {
@@ -3557,6 +3703,16 @@ LogicalResult OpFormatParser::verifyOptionalGroupElement(SMLoc loc,
"can be used to anchor an optional group");
return success();
})
+ // All properties can be within the optional group, but only
+ // default-valued properties with an optional parser can be the anchor.
+ .Case([&](PropertyVariable *propEle) {
+ Property prop = propEle->getVar()->prop;
+ if (isAnchor && !(prop.hasDefaultValue() && prop.hasOptionalParser()))
+ return emitError(loc, "only properties with default values "
+ "that can be optionally parsed "
+ "can be used to anchor an optional group");
+ return success();
+ })
// Only optional-like(i.e. variadic) operands can be within an optional
// group.
.Case([&](OperandVariable *ele) {
diff --git a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
index a1c089a..907fe10 100644
--- a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
@@ -10,6 +10,12 @@ package(
licenses(["notice"])
+genrule(
+ name = "generate_vcs_revision",
+ outs = ["include/VCSVersion.inc"],
+ cmd = "echo '#undef BOLT_REVISION' >> $@\n",
+)
+
cc_binary(
name = "llvm-bolt-heatmap",
srcs = glob([
@@ -114,6 +120,7 @@ cc_library(
textual_hdrs = glob([
"include/bolt/RuntimeLibs/*.h",
]) + ["include/bolt/RuntimeLibs/RuntimeLibraryVariables.inc"],
+ defines=["CMAKE_INSTALL_FULL_LIBDIR=\\\"\\\""],
deps = [
":Core",
":Passes",
@@ -289,7 +296,9 @@ cc_library(
srcs = glob([
"lib/Utils/*.cpp",
]),
- hdrs = glob([
+ hdrs = [
+ "include/VCSVersion.inc",
+ ] + glob([
"include/bolt/Utils/*.h",
]),
includes = ["include"],
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 64d36c7..4d443e8 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -944,10 +944,7 @@ cc_library(
srcs = glob([
"lib/IR/*.cpp",
"lib/IR/*.h",
- ]) + [
- # To avoid a dependency cycle.
- "include/llvm/Analysis/IVDescriptors.h",
- ],
+ ]),
hdrs = glob(
[
"include/llvm/*.h",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 536c825..f83c471 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -2142,6 +2142,7 @@ cc_library(
":FuncTransforms",
":FunctionInterfaces",
":IR",
+ ":IndexDialect",
":LLVMCommonConversion",
":LLVMDialect",
":MemRefDialect",
@@ -2149,6 +2150,7 @@ cc_library(
":SCFDialect",
":SCFTransforms",
":TransformUtils",
+ ":VectorUtils",
"//llvm:Support",
],
)
@@ -5744,6 +5746,7 @@ cc_library(
"lib/Conversion/GPUCommon/OpToFuncCallLowering.h",
],
deps = [
+ ":ArithDialect",
":GPUDialect",
":IR",
":LLVMCommonConversion",