diff options
author | Austin Kerbow <Austin.Kerbow@amd.com> | 2020-11-30 09:06:35 -0800 |
---|---|---|
committer | Austin Kerbow <Austin.Kerbow@amd.com> | 2021-01-26 11:25:51 -0800 |
commit | 2291bd137d12cc4f806d80be93bb442246df4f0e (patch) | |
tree | ff8d25c785aec9d819e9554fb0408014601bb5b8 | |
parent | 683719bc0cc8e12a5f9c06135fc97a13ef414f69 (diff) | |
download | llvm-2291bd137d12cc4f806d80be93bb442246df4f0e.zip llvm-2291bd137d12cc4f806d80be93bb442246df4f0e.tar.gz llvm-2291bd137d12cc4f806d80be93bb442246df4f0e.tar.bz2 |
[AMDGPU] Update subtarget features for new target ID support
Support for XNACK and SRAMECC is not static on some GPUs. We must be able
to differentiate between different scenarios for these dynamic subtarget
features.
The possible settings are:
- Unsupported: The GPU has no support for XNACK/SRAMECC.
- Any: Preference is unspecified. Use conservative settings that can run anywhere.
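- Off: XNACK/SRAMECC is explicitly requested to be off ("-xnack"/"-sramecc" in the feature string).
- On: XNACK/SRAMECC is explicitly requested to be on ("+xnack"/"+sramecc" in the feature string).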
GCNSubtarget will track the four options based on the following criteria. If
the subtarget does not support XNACK/SRAMECC we say the setting is
"Unsupported". If no subtarget features for XNACK/SRAMECC are requested we
must support "Any" mode. If the subtarget features XNACK/SRAMECC exist in the
feature string when initializing the subtarget, the setting is "On" (for "+xnack"/"+sramecc") or "Off" (for "-xnack"/"-sramecc").
The defaults are updated to be conservatively correct, meaning if no setting
for XNACK or SRAMECC is explicitly requested, defaults will be used which
generate code that can be run anywhere. This corresponds to the "Any" setting.
Differential Revision: https://reviews.llvm.org/D85882
103 files changed, 7768 insertions, 6672 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 0a212a41..c352c00 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -129,10 +129,10 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; -def FeatureDoesNotSupportXNACK : SubtargetFeature<"no-xnack-support", - "DoesNotSupportXNACK", +def FeatureSupportsXNACK : SubtargetFeature<"xnack-support", + "SupportsXNACK", "true", - "Hardware does not support XNACK" + "Hardware supports XNACK" >; // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support @@ -491,16 +491,16 @@ def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts", [FeatureFlatGlobalInsts] >; -def FeatureDoesNotSupportSRAMECC : SubtargetFeature<"no-sram-ecc-support", - "DoesNotSupportSRAMECC", +def FeatureSupportsSRAMECC : SubtargetFeature<"sramecc-support", + "SupportsSRAMECC", "true", - "Hardware does not support SRAM ECC" + "Hardware supports SRAMECC" >; -def FeatureSRAMECC : SubtargetFeature<"sram-ecc", +def FeatureSRAMECC : SubtargetFeature<"sramecc", "EnableSRAMECC", "true", - "Enable SRAM ECC" + "Enable SRAMECC" >; def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx", @@ -675,8 +675,7 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, - FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC, - FeatureDoesNotSupportXNACK] + FeatureTrigReducedRange] >; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", @@ -685,8 +684,7 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, 
FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC, - FeatureUnalignedBufferAccess] + FeatureDsSrc2Insts, FeatureUnalignedBufferAccess] >; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", @@ -699,9 +697,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC, FeatureFastDenormalF32, - FeatureUnalignedBufferAccess - ] + FeatureDsSrc2Insts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess] >; def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", @@ -717,9 +713,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, - FeatureFastDenormalF32, FeatureUnalignedBufferAccess, - FeatureUnalignedDSAccess - ] + FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, + FeatureSupportsXNACK] >; def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", @@ -735,7 +730,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, FeatureVOP3Literal, FeatureDPP8, - FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC, + FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess ] @@ -748,84 +743,72 @@ class FeatureSet<list<SubtargetFeature> Features_> { def FeatureISAVersion6_0_0 : FeatureSet<[FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops, - FeatureLDSBankCount32, - 
FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion6_0_1 : FeatureSet< [FeatureSouthernIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion6_0_2 : FeatureSet< [FeatureSouthernIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion7_0_0 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion7_0_1 : FeatureSet< [FeatureSeaIslands, HalfRate64Ops, FeatureLDSBankCount32, - FeatureFastFMAF32, - FeatureDoesNotSupportXNACK]>; + FeatureFastFMAF32]>; def FeatureISAVersion7_0_2 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount16, - FeatureFastFMAF32, - FeatureDoesNotSupportXNACK]>; + FeatureFastFMAF32]>; def FeatureISAVersion7_0_3 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount16, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount16]>; def FeatureISAVersion7_0_4 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion7_0_5 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount16, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount16]>; def FeatureISAVersion8_0_1 : FeatureSet< [FeatureVolcanicIslands, FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, - FeatureXNACK, + FeatureSupportsXNACK, FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_2 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, - FeatureUnpackedD16VMem, - FeatureDoesNotSupportXNACK]>; + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_3 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, - FeatureUnpackedD16VMem, - FeatureDoesNotSupportXNACK]>; + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_5 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, - FeatureUnpackedD16VMem, - FeatureDoesNotSupportXNACK]>; + 
FeatureUnpackedD16VMem]>; def FeatureISAVersion8_1_0 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount16, - FeatureXNACK, + FeatureSupportsXNACK, FeatureImageStoreD16Bug, FeatureImageGather4D16Bug]>; @@ -833,24 +816,18 @@ def FeatureISAVersion9_0_0 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK, - FeatureDoesNotSupportSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_2 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK, - FeatureDoesNotSupportSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_4 : FeatureSet< [FeatureGFX9, FeatureLDSBankCount32, FeatureFmaMixInsts, - FeatureDoesNotSupportXNACK, - FeatureDoesNotSupportSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_6 : FeatureSet< @@ -861,7 +838,7 @@ def FeatureISAVersion9_0_6 : FeatureSet< FeatureDLInsts, FeatureDot1Insts, FeatureDot2Insts, - FeatureDoesNotSupportXNACK, + FeatureSupportsSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_8 : FeatureSet< @@ -879,7 +856,7 @@ def FeatureISAVersion9_0_8 : FeatureSet< FeatureMAIInsts, FeaturePkFmacF16Inst, FeatureAtomicFaddInsts, - FeatureSRAMECC, + FeatureSupportsSRAMECC, FeatureMFMAInlineLiteralBug, FeatureImageGather4D16Bug]>; @@ -887,7 +864,6 @@ def FeatureISAVersion9_0_9 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_C : FeatureSet< @@ -928,7 +904,7 @@ def FeatureISAVersion10_1_0 : FeatureSet< FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureDoesNotSupportXNACK])>; + FeatureSupportsXNACK])>; def FeatureISAVersion10_1_1 : FeatureSet< !listconcat(FeatureGroup.GFX10_1_Bugs, @@ -949,7 +925,7 @@ def FeatureISAVersion10_1_1 : FeatureSet< FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureDoesNotSupportXNACK])>; + FeatureSupportsXNACK])>; def FeatureISAVersion10_1_2 : FeatureSet< 
!listconcat(FeatureGroup.GFX10_1_Bugs, @@ -970,7 +946,7 @@ def FeatureISAVersion10_1_2 : FeatureSet< FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureDoesNotSupportXNACK])>; + FeatureSupportsXNACK])>; def FeatureISAVersion10_3_0 : FeatureSet< [FeatureGFX10, @@ -983,8 +959,7 @@ def FeatureISAVersion10_3_0 : FeatureSet< FeatureDot5Insts, FeatureDot6Insts, FeatureNSAEncoding, - FeatureWavefrontSize32, - FeatureDoesNotSupportXNACK]>; + FeatureWavefrontSize32]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 174aac5..f1a7d74 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -19,6 +19,7 @@ #include "AMDGPURegisterBankInfo.h" #include "AMDGPUTargetMachine.h" #include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" #include "llvm/CodeGen/MachineScheduler.h" @@ -88,8 +89,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, // Similarly we want enable-prt-strict-null to be on by default and not to // unset everything else if it is disabled - // Assuming ECC is enabled is the conservative default. - SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,"); + SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default if (isAmdHsaOS()) @@ -164,20 +164,12 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - // Disable XNACK on targets where it is not enabled by default unless it is - // explicitly requested. 
- if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) { - ToggleFeature(AMDGPU::FeatureXNACK); - EnableXNACK = false; - } + TargetID.setTargetIDFromFeaturesString(FS); - // ECC is on by default, but turn it off if the hardware doesn't support it - // anyway. This matters for the gfx9 targets with d16 loads, but don't support - // ECC. - if (DoesNotSupportSRAMECC && EnableSRAMECC) { - ToggleFeature(AMDGPU::FeatureSRAMECC); - EnableSRAMECC = false; - } + LLVM_DEBUG(dbgs() << "xnack setting for subtarget: " + << TargetID.getXnackSetting() << '\n'); + LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: " + << TargetID.getSramEccSetting() << '\n'); return *this; } @@ -206,6 +198,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT), TargetTriple(TT), + TargetID(*this), Gen(INVALID), InstrItins(getInstrItineraryForCPU(GPU)), LDSBankCount(0), @@ -221,8 +214,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, UnalignedAccessMode(false), HasApertureRegs(false), + SupportsXNACK(false), EnableXNACK(false), - DoesNotSupportXNACK(false), EnableCuMode(false), TrapHandler(false), @@ -271,8 +264,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasMAIInsts(false), HasPkFmacF16Inst(false), HasAtomicFaddInsts(false), + SupportsSRAMECC(false), EnableSRAMECC(false), - DoesNotSupportSRAMECC(false), HasNoSdstCMPX(false), HasVscnt(false), HasGetWaveIdInst(false), diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a59b5cf..7a71781 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -72,6 +72,7 @@ private: protected: // Basic subtarget description. 
Triple TargetTriple; + AMDGPU::IsaInfo::AMDGPUTargetID TargetID; unsigned Gen; InstrItineraryData InstrItins; int LDSBankCount; @@ -88,8 +89,12 @@ protected: bool UnalignedScratchAccess; bool UnalignedAccessMode; bool HasApertureRegs; + bool SupportsXNACK; + + // This should not be used directly. 'TargetID' tracks the dynamic settings + // for XNACK. bool EnableXNACK; - bool DoesNotSupportXNACK; + bool EnableCuMode; bool TrapHandler; @@ -142,8 +147,12 @@ protected: bool HasMAIInsts; bool HasPkFmacF16Inst; bool HasAtomicFaddInsts; + bool SupportsSRAMECC; + + // This should not be used directly. 'TargetID' tracks the dynamic settings + // for SRAMECC. bool EnableSRAMECC; - bool DoesNotSupportSRAMECC; + bool HasNoSdstCMPX; bool HasVscnt; bool HasGetWaveIdInst; @@ -498,7 +507,7 @@ public: } bool isXNACKEnabled() const { - return EnableXNACK; + return TargetID.isXnackOnOrAny(); } bool isCuModeEnabled() const { @@ -561,7 +570,7 @@ public: } bool d16PreservesUnusedBits() const { - return hasD16LoadStore() && !isSRAMECCEnabled(); + return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); } bool hasD16Images() const { @@ -669,10 +678,6 @@ public: return HasAtomicFaddInsts; } - bool isSRAMECCEnabled() const { - return EnableSRAMECC; - } - bool hasNoSdstCMPX() const { return HasNoSdstCMPX; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 4905bba..4c1e4de 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -246,6 +246,94 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) { namespace IsaInfo { +AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) + : XnackSetting(TargetIDSetting::Any), SramEccSetting(TargetIDSetting::Any) { + if (!STI.getFeatureBits().test(FeatureSupportsXNACK)) + XnackSetting = TargetIDSetting::Unsupported; + if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC)) + SramEccSetting = TargetIDSetting::Unsupported; +} + 
+void AMDGPUTargetID::setTargetIDFromFeaturesString(StringRef FS) { + // Check if xnack or sramecc is explicitly enabled or disabled. In the + // absence of the target features we assume we must generate code that can run + // in any environment. + SubtargetFeatures Features(FS); + Optional<bool> XnackRequested; + Optional<bool> SramEccRequested; + + for (const std::string &Feature : Features.getFeatures()) { + if (Feature == "+xnack") + XnackRequested = true; + else if (Feature == "-xnack") + XnackRequested = false; + else if (Feature == "+sramecc") + SramEccRequested = true; + else if (Feature == "-sramecc") + SramEccRequested = false; + } + + bool XnackSupported = isXnackSupported(); + bool SramEccSupported = isSramEccSupported(); + + if (XnackRequested) { + if (XnackSupported) { + XnackSetting = + *XnackRequested ? TargetIDSetting::On : TargetIDSetting::Off; + } else { + // If a specific xnack setting was requested and this GPU does not support + // xnack emit a warning. Setting will remain set to "Unsupported". + if (*XnackRequested) { + errs() << "warning: xnack 'On' was requested for a processor that does " + "not support it!\n"; + } else { + errs() << "warning: xnack 'Off' was requested for a processor that " + "does not support it!\n"; + } + } + } + + if (SramEccRequested) { + if (SramEccSupported) { + SramEccSetting = + *SramEccRequested ? TargetIDSetting::On : TargetIDSetting::Off; + } else { + // If a specific sramecc setting was requested and this GPU does not + // support sramecc emit a warning. Setting will remain set to + // "Unsupported". 
+ if (*SramEccRequested) { + errs() << "warning: sramecc 'On' was requested for a processor that " + "does not support it!\n"; + } else { + errs() << "warning: sramecc 'Off' was requested for a processor that " + "does not support it!\n"; + } + } + } +} + +static TargetIDSetting +getTargetIDSettingFromFeatureString(StringRef FeatureString) { + if (FeatureString.endswith("-")) + return TargetIDSetting::Off; + if (FeatureString.endswith("+")) + return TargetIDSetting::On; + + llvm_unreachable("Malformed feature string"); +} + +void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) { + SmallVector<StringRef, 3> TargetIDSplit; + TargetID.split(TargetIDSplit, ':'); + + for (const auto &FeatureString : TargetIDSplit) { + if (FeatureString.startswith("xnack")) + XnackSetting = getTargetIDSettingFromFeatureString(FeatureString); + if (FeatureString.startswith("sramecc")) + SramEccSetting = getTargetIDSettingFromFeatureString(FeatureString); + } +} + void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) { auto TargetTriple = STI->getTargetTriple(); auto Version = getIsaVersion(STI->getCPU()); @@ -262,7 +350,7 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) { if (hasXNACK(*STI)) Stream << "+xnack"; if (hasSRAMECC(*STI)) - Stream << "+sram-ecc"; + Stream << "+sramecc"; Stream.flush(); } @@ -1688,4 +1776,24 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, } } // namespace AMDGPU + +raw_ostream &operator<<(raw_ostream &OS, + const AMDGPU::IsaInfo::TargetIDSetting S) { + switch (S) { + case (AMDGPU::IsaInfo::TargetIDSetting::Unsupported): + OS << "Unsupported"; + break; + case (AMDGPU::IsaInfo::TargetIDSetting::Any): + OS << "Any"; + break; + case (AMDGPU::IsaInfo::TargetIDSetting::Off): + OS << "Off"; + break; + case (AMDGPU::IsaInfo::TargetIDSetting::On): + OS << "On"; + break; + } + return OS; +} + } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f0332df..f937869 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -69,6 +69,84 @@ enum { TRAP_NUM_SGPRS = 16 }; +enum class TargetIDSetting { + Unsupported, + Any, + Off, + On +}; + +class AMDGPUTargetID { +private: + TargetIDSetting XnackSetting; + TargetIDSetting SramEccSetting; + +public: + explicit AMDGPUTargetID(const MCSubtargetInfo &STI); + ~AMDGPUTargetID() = default; + + /// \return True if the current xnack setting is not "Unsupported". + bool isXnackSupported() const { + return XnackSetting != TargetIDSetting::Unsupported; + } + + /// \returns True if the current xnack setting is "On" or "Any". + bool isXnackOnOrAny() const { + return XnackSetting == TargetIDSetting::On || + XnackSetting == TargetIDSetting::Any; + } + + /// \returns True if current xnack setting is "On" or "Off", + /// false otherwise. + bool isXnackOnOrOff() const { + return getXnackSetting() == TargetIDSetting::On || + getXnackSetting() == TargetIDSetting::Off; + } + + /// \returns The current xnack TargetIDSetting, possible options are + /// "Unsupported", "Any", "Off", and "On". + TargetIDSetting getXnackSetting() const { + return XnackSetting; + } + + /// Sets xnack setting to \p NewXnackSetting. + void setXnackSetting(TargetIDSetting NewXnackSetting) { + XnackSetting = NewXnackSetting; + } + + /// \return True if the current sramecc setting is not "Unsupported". + bool isSramEccSupported() const { + return SramEccSetting != TargetIDSetting::Unsupported; + } + + /// \returns True if the current sramecc setting is "On" or "Any". + bool isSramEccOnOrAny() const { + return SramEccSetting == TargetIDSetting::On || + SramEccSetting == TargetIDSetting::Any; + } + + /// \returns True if current sramecc setting is "On" or "Off", + /// false otherwise. 
+ bool isSramEccOnOrOff() const { + return getSramEccSetting() == TargetIDSetting::On || + getSramEccSetting() == TargetIDSetting::Off; + } + + /// \returns The current sramecc TargetIDSetting, possible options are + /// "Unsupported", "Any", "Off", and "On". + TargetIDSetting getSramEccSetting() const { + return SramEccSetting; + } + + /// Sets sramecc setting to \p NewSramEccSetting. + void setSramEccSetting(TargetIDSetting NewSramEccSetting) { + SramEccSetting = NewSramEccSetting; + } + + void setTargetIDFromFeaturesString(StringRef FS); + void setTargetIDFromTargetIDStream(StringRef TargetID); +}; + /// Streams isa version string for given subtarget \p STI into \p Stream. void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream); @@ -873,6 +951,10 @@ struct SIModeRegisterDefaults { }; } // end namespace AMDGPU + +raw_ostream &operator<<(raw_ostream &OS, + const AMDGPU::IsaInfo::TargetIDSetting S); + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index f0eefbd..f5caf8f8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s ; Check lowering of some large extractelement that use the stack ; instead of register indexing. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll index a57e681..2440b71 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -94,57 +94,57 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr, ; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 1, v2 -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: v_add_u32_e32 v17, 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v8, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v9, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX9-NEXT: v_cndmask_b32_e32 
v2, v2, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v10, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v11, v7, vcc ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v14, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 5f04b36..77c8b431 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -1672,7 +1672,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32 ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 @@ -2186,7 +2186,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32 ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 @@ -2361,7 +2361,7 @@ 
define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3 ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index c6e9623..53b9649 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -187,9 +187,9 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-NEXT: s_add_u32 s1, 0x104, s1 ; GFX9-NEXT: scratch_load_dword v1, off, s2 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_u32 s0, 0x104, s0 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_u32 s0, 0x104, s0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -364,9 +364,9 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX9-NEXT: scratch_load_dword v1, off, s2 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index 88af446..bbc2dba 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -7,17 +7,18 @@ define amdgpu_kernel void 
@v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.ptr, <64 x i32> addrspace(1)* %ptr, i32 %val, i32 %idx) #0 { ; GCN-LABEL: v_insert_v64i32_varidx: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GCN-NEXT: v_mov_b32_e32 v16, 0x100 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_add_u32_e32 v31, 64, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x0 ; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x40 ; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x80 -; GCN-NEXT: v_add_u32_e32 v31, 64, v16 ; GCN-NEXT: v_add_u32_e32 v32, 0x44, v16 +; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 @@ -43,7 +44,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. ; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v1, s53 ; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s54 ; GCN-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v34, 0x4c, v16 @@ -175,7 +175,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. ; GCN-NEXT: buffer_store_dword v1, v75, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v76, 0xf4, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NEXT: s_and_b32 s5, s5, 63 +; GCN-NEXT: s_and_b32 s4, s7, 63 ; GCN-NEXT: buffer_store_dword v1, v76, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v77, 0xf8, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s50 @@ -183,7 +183,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. 
; GCN-NEXT: buffer_store_dword v1, v77, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v78, 0xfc, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s51 -; GCN-NEXT: s_lshl_b32 s5, s5, 2 +; GCN-NEXT: s_lshl_b32 s4, s4, 2 ; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v18, 12, v16 ; GCN-NEXT: v_add_u32_e32 v19, 16, v16 @@ -199,8 +199,8 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. ; GCN-NEXT: v_add_u32_e32 v29, 56, v16 ; GCN-NEXT: v_add_u32_e32 v30, 60, v16 ; GCN-NEXT: buffer_store_dword v1, v78, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, s5, v16 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_add_u32_e32 v1, s4, v16 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen @@ -216,6 +216,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. ; GCN-NEXT: buffer_store_dword v15, v30, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v2, v17, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v3, v18, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v4, v19, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index 10659e5..4c497e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -2133,8 +2133,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v16i16_s_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], 
v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s3, 1 ; GFX9-NEXT: s_lshr_b32 s12, s3, 1 ; GFX9-NEXT: s_mov_b32 s0, 0xffff @@ -2152,26 +2152,26 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, 
v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off @@ -2843,8 +2843,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v16i16_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX9-NEXT: s_mov_b32 s0, 0xffff @@ -2862,25 +2862,25 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] -; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX9-NEXT: 
v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off @@ -2992,8 +2992,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v16i16_v_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s2, 1 ; GFX9-NEXT: s_lshr_b32 s12, s2, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 @@ -3009,26 +3009,26 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; 
GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] -; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off @@ -3140,8 +3140,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, define amdgpu_ps void 
@insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v16i16_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 @@ -3158,25 +3158,25 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] ; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, 
v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index 7cad269..a9508c4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -38,6 +38,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GCN-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:48 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 ; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 ; GCN-NEXT: s_waitcnt vmcnt(7) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll index d4fac6b..0257c06 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -1586,10 +1586,10 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll index a2a212f..f25ec6c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -252,37 +252,37 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32] ; GFX10_W32-LABEL: test_div_fmas_f32: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x4 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dword s7, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s4 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10_W32-NEXT: v_div_fmas_f32 v0, s5, v0, v1 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s7 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x4 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 -; 
GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dword s7, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX10_W64-NEXT: v_div_fmas_f32 v0, s5, v0, v1 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s7 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -329,33 +329,33 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %o ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; 
GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -402,33 +402,33 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %o ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x58 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x58 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x34 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x58 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x58 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x34 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -475,33 +475,33 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %o ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 ; GFX10_W32-NEXT: 
s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -552,35 +552,35 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double % ; GFX10_W32-LABEL: test_div_fmas_f64: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x1 -; GFX10_W32-NEXT: s_load_dword s8, 
s[0:1], 0x44 -; GFX10_W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10_W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s8, 1, s8 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s6 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s5 -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s7 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 -; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] +; GFX10_W32-NEXT: s_and_b32 s0, 1, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s8 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s10 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s9 +; GFX10_W32-NEXT: v_mov_b32_e32 v3, s11 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX10_W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10_W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10_W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f64: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x1 -; GFX10_W64-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX10_W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10_W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s8, 1, s8 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s6 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5 -; GFX10_W64-NEXT: v_mov_b32_e32 v3, s7 -; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] +; GFX10_W64-NEXT: s_and_b32 s0, 1, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s8 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s10 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s9 +; GFX10_W64-NEXT: v_mov_b32_e32 v3, s11 +; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; 
GFX10_W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX10_W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10_W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10_W64-NEXT: s_endpgm %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) store double %result, double addrspace(1)* %out, align 8 @@ -630,34 +630,34 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x1 ; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x1 ; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, 
s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %cmp = icmp eq i32 %i, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) @@ -703,33 +703,33 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa ; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, 
s5 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) store float %result, float addrspace(1)* %out, align 4 @@ -774,33 +774,33 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac ; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, 1 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 1 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) store float %result, float addrspace(1)* %out, align 4 @@ -882,15 +882,15 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4 glc dlc ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) -; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8 glc dlc +; GFX10_W32-NEXT: global_load_dword v4, v1, s[6:7] offset:8 glc dlc ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 -; GFX10_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v4 ; GFX10_W32-NEXT: global_store_dword v1, v0, s[4:5] offset:8 ; GFX10_W32-NEXT: s_endpgm ; @@ -905,15 +905,15 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4 glc dlc ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8 glc dlc +; GFX10_W64-NEXT: global_load_dword v4, v1, s[6:7] offset:8 glc dlc ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 ; 
GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 ; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v4 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[4:5] offset:8 ; GFX10_W64-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index 4f17689..68afd70 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -52,9 +52,9 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 +; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -119,9 +119,9 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v0, v1 +; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -186,12 +186,12 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl ; GFX10-LABEL: test_div_scale_f64_1: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -258,12 +258,12 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl ; GFX10-LABEL: test_div_scale_f64_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -792,13 +792,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* % ; GFX10-LABEL: test_div_scale_f32_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s3, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_scale_f32 v0, s0, s5, s5, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -836,13 +836,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* % ; GFX10-LABEL: test_div_scale_f32_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s2, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s5, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) %result0 = extractvalue { float, i1 } %result, 0 @@ -881,11 +881,11 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* ; ; GFX10-LABEL: test_div_scale_f64_all_scalar_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -927,11 +927,11 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* ; 
; GFX10-LABEL: test_div_scale_f64_all_scalar_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1104,11 +1104,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1176,9 +1176,9 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2 ; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll index 7e11446..09921f1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll @@ -30,6 +30,18 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha ; GFX81-NEXT: s_mov_b32 s7, s9 ; GFX81-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 ; GFX81-NEXT: s_endpgm +; PACKED-LABEL: image_store_f16: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_mov_b32 s0, s2 +; PACKED-NEXT: s_mov_b32 s1, s3 +; PACKED-NEXT: s_mov_b32 s2, s4 +; PACKED-NEXT: s_mov_b32 s3, s5 +; PACKED-NEXT: s_mov_b32 s4, s6 +; PACKED-NEXT: s_mov_b32 s5, s7 +; PACKED-NEXT: s_mov_b32 s6, s8 +; PACKED-NEXT: s_mov_b32 s7, s9 +; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 +; PACKED-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } @@ -61,6 +73,18 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX81-NEXT: s_mov_b32 s7, s9 ; GFX81-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16 ; GFX81-NEXT: s_endpgm +; PACKED-LABEL: image_store_v2f16: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_mov_b32 s0, s2 +; PACKED-NEXT: s_mov_b32 s1, s3 +; PACKED-NEXT: s_mov_b32 s2, s4 +; PACKED-NEXT: s_mov_b32 s3, s5 +; PACKED-NEXT: s_mov_b32 s4, s6 +; PACKED-NEXT: s_mov_b32 s5, s7 +; PACKED-NEXT: s_mov_b32 s6, s8 +; PACKED-NEXT: s_mov_b32 s7, s9 +; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16 +; PACKED-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } @@ -135,6 +159,18 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX81-NEXT: s_mov_b32 s7, s9 ; GFX81-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16 ; GFX81-NEXT: s_endpgm +; 
PACKED-LABEL: image_store_v4f16: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_mov_b32 s0, s2 +; PACKED-NEXT: s_mov_b32 s1, s3 +; PACKED-NEXT: s_mov_b32 s2, s4 +; PACKED-NEXT: s_mov_b32 s3, s5 +; PACKED-NEXT: s_mov_b32 s4, s6 +; PACKED-NEXT: s_mov_b32 s5, s7 +; PACKED-NEXT: s_mov_b32 s6, s8 +; PACKED-NEXT: s_mov_b32 s7, s9 +; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16 +; PACKED-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll index 1610b84..b84171b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -21,13 +21,13 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) { ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; encoding: [0x00,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x00,0x00] +; GFX10-NEXT: 
global_store_dword v1, v0, s[2:3] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x02,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0 store i32 %tmp0, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll index ab2a399..8fef0ad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -4,6 +4,21 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 
0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -19,7 +34,12 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret half %val +} + +define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 @@ -31,14 +51,9 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, 
[[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) - ret half %val -} - -define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -65,21 +80,6 @@ define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__ ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; 
PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <2 x half> %val } @@ -91,6 +91,24 @@ define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__ ; } define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 + ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 + ; PACKED: $vgpr0 = COPY [[COPY7]] + ; PACKED: $vgpr1 = COPY [[COPY8]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; UNPACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -127,29 +145,27 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__ ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret <4 x half> %val +} + +define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED: 
[[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) - ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 - ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 - ; PACKED: $vgpr0 = COPY [[COPY7]] - ; PACKED: $vgpr1 = COPY [[COPY8]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) - ret <4 x half> %val -} - -define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY 
[[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -166,27 +182,57 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 - ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY 
[[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val } define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: successors: %bb.2(0x80000000) + ; PACKED: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; PACKED: bb.2: + ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; PACKED: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; 
PACKED: bb.3: + ; PACKED: successors: %bb.4(0x80000000) + ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; PACKED: bb.4: + ; PACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 + ; PACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 + ; PACKED: $vgpr0 = COPY [[COPY11]] + ; PACKED: $vgpr1 = COPY [[COPY12]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; UNPACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: successors: %bb.2(0x80000000) @@ -251,72 +297,11 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: successors: %bb.2(0x80000000) - ; PACKED: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; PACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; PACKED: 
bb.2: - ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec - ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec - ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], 
[[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) - ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; PACKED: bb.3: - ; PACKED: successors: %bb.4(0x80000000) - ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; PACKED: bb.4: - ; PACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 - ; PACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 - ; PACKED: $vgpr0 = COPY [[COPY11]] - ; PACKED: $vgpr1 = COPY [[COPY12]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <4 x half> %val } define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { - ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 - ; UNPACKED: bb.1 (%ir-block.0): - ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; UNPACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], 
%subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4) - ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] - ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 ; PACKED: bb.1 (%ir-block.0): ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -332,6 +317,21 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], 
%subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 25c5411..adc6dd4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -20,14 +20,14 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) store i32 %tmp0, i32 addrspace(1)* %out diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index ff6467a..ad792cc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -18,52 +18,52 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) { ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[0:1], off offset:1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[0:1], off offset:2 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[0:1], off offset:3 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[0:1], off offset:4 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[0:1], off offset:5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[0:1], off offset:6 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[0:1], off offset:7 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[0:1], off offset:8 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[0:1], off offset:9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off offset:10 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off offset:11 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off offset:1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[0:1], off offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[0:1], off offset:3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[0:1], off offset:4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[0:1], off offset:5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[0:1], off offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[0:1], off offset:7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[0:1], off 
offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[0:1], off offset:9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[0:1], off offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[0:1], off offset:11 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v3, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v12, s5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s4, v11 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, s4, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, s4, v10 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v8, s5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v11, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v3, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v6, s4, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v9 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v8, v10, v1, v11 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 24, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v4 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v10 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v13, s4, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v5, v0, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v8, v9, s4, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v4, v11, v10 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v8, v7, v6 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v5, v3, v2 ; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1: @@ -154,25 +154,25 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) { ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[0:1], off offset:2 -; 
GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[0:1], off offset:4 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[0:1], off offset:6 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[0:1], off offset:8 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v7, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[0:1], off offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[0:1], off offset:4 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[0:1], off offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[0:1], off offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[0:1], off offset:10 ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v6 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v4 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v2, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, s4, v3 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v6, s4, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v7, s4, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v3, s4, v2 ; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2: @@ -393,52 +393,52 @@ define amdgpu_ps <3 x i32> 
@s_load_constant_v3i32_align1(<3 x i32> addrspace(4)* ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v0, s[0:1] offset:1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v0, s[0:1] offset:2 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v0, s[0:1] offset:3 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v0, s[0:1] offset:4 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v0, s[0:1] offset:5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v0, s[0:1] offset:6 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v0, s[0:1] offset:7 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:10 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v0, s[0:1] offset:11 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xff +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v0, s[0:1] +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v0, s[0:1] offset:3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v0, s[0:1] offset:4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v0, s[0:1] offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v0, s[0:1] offset:7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v0, s[0:1] offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v0, s[0:1] offset:9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v0, s[0:1] offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v0, s[0:1] offset:11 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 ; 
GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v11, s1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, s0, v10 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v1, s0, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v11, v12, s0, v11 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v6, s1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, s1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v12 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v0, v12 -; 
GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v5, s0, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v8 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v7, v9, v12, v10 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 24, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v3 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v6 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v8, v9 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v1, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v4, v0, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v7, v8, s0, v7 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v11, v10, v9 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v7, v6, v5 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v2, v4 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 @@ -538,25 +538,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)* ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v0, s[0:1] offset:4 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:6 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:8 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v0, s[0:1] offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v0, s[0:1] +; 
GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:4 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v0, s[0:1] offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v0, s[0:1] offset:10 ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v5 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v1, s0, v2 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v3, s0, v4 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v5, s0, v6 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, s0, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, s0, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v6, s0, v0 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 @@ -614,10 +614,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)* define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -636,10 +636,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(<3 x i32> addrspace(4)* define amdgpu_ps i96 @s_load_constant_i96_align8(i96 addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_i96_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -658,10 +658,10 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(i96 addrspace(4)* inreg %ptr) { define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -680,10 +680,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(<3 x i32> addrspace(4)* define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(<6 x i16> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v6i16_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: 
s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 25e7bee..a2d8d05 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -109,15 +109,15 @@ define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, gv3@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, gv3@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v0, s[4:5] +; GFX9-NEXT: global_store_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB1_2: ; %Flow ; GFX9-NEXT: s_xor_b32 s0, s0, -1 @@ -132,8 +132,8 @@ define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-NEXT: s_add_u32 s2, s2, gv1@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index 44b0732..1fb9efe 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -36,17 +36,17 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x10 -; GCN-NEXT: s_add_u32 s5, s32, 0x1000 -; GCN-NEXT: s_add_u32 s8, s5, 4 +; GCN-NEXT: s_load_dword s8, s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s4, s32, 0x1000 +; GCN-NEXT: s_add_u32 s5, s4, 4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: s_lshl_b32 s5, s8, 2 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s8 -; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_add_u32 s4, s4, s5 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen @@ -109,18 +109,18 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_load_dword s4, s[4:5], 0xc -; GCN-NEXT: s_add_u32 s5, s32, 0x1000 -; GCN-NEXT: s_and_b32 s5, s5, 0xfffff000 -; GCN-NEXT: s_add_u32 s8, s5, 4 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xc +; GCN-NEXT: s_add_u32 s4, s32, 0x1000 +; GCN-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GCN-NEXT: s_add_u32 s5, s4, 4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 -; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: s_lshl_b32 s5, s8, 2 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: v_mov_b32_e32 
v3, s8 -; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_add_u32 s4, s4, s5 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 8c3f37b..f703543 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -8,14 +8,14 @@ define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; @@ -39,51 +39,51 @@ define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s5, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 
-; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 -; GFX9-NEXT: s_lshr_b32 s7, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s3, s4, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NEXT: s_lshr_b32 s1, s5, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s5, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s6, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s0, s3, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_lshr_b32 s0, s7, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s3, 16 +; GFX9-NEXT: s_lshr_b32 s1, s7, 16 ; GFX9-NEXT: 
ds_write_b8 v1, v0 offset:13 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s2, s3, 24 +; GFX9-NEXT: s_lshr_b32 s2, s7, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:14 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:15 @@ -148,27 +148,27 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, s6, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s0, s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_lshr_b32 s0, s7, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:14 @@ -209,15 +209,15 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, define amdgpu_kernel void 
@store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: ds_write2_b32 v1, v3, v0 offset0:2 offset1:3 ; GFX9-NEXT: s_endpgm ; @@ -242,14 +242,14 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm ; @@ -273,14 +273,14 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index c10b655..bb321fe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -8,13 +8,13 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; @@ -37,42 +37,42 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword 
s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_lshr_b32 s0, s12, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s1, s12, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s3, s12, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b32 s0, s13, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshr_b32 s4, s1, 24 +; GFX9-NEXT: s_lshr_b32 s1, s13, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s13, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: s_lshr_b32 s0, s14, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 +; GFX9-NEXT: s_lshr_b32 s1, s14, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s3, s2, 24 +; GFX9-NEXT: s_lshr_b32 s2, s14, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX9-NEXT: s_endpgm ; @@ -124,22 +124,22 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_lshr_b32 s0, s12, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b32 s0, s13, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: s_lshr_b32 s0, s14, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 @@ -175,13 +175,13 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: s_endpgm @@ -206,13 +206,13 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: s_endpgm @@ -237,13 +237,13 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: 
s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index adbbb0e..e0037f0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -360,7 +360,6 @@ define i32 @select_mul_lhs_const_i32(i1 %cond) { ; IR-LABEL: @select_mul_lhs_const_i32( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000 ; IR-NEXT: ret i32 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %op = mul i32 1000, %select ret i32 %op @@ -380,7 +379,6 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) { ; IR-LABEL: @select_mul_rhs_const_i32( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000 ; IR-NEXT: ret i32 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %op = mul i32 %select, 1000 ret i32 %op @@ -420,7 +418,6 @@ define i16 @select_add_trunc_select(i1 %cond) { ; IR-LABEL: @select_add_trunc_select( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50 ; IR-NEXT: ret i16 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %trunc = trunc i32 %select to i16 %op = add i16 %trunc, 42 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index ac752b7..fe32aa0 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -176,33 +176,33 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX7LESS-NEXT: s_load_dword 
s0, s[0:1], 0xb +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 +; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -243,28 +243,28 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; 
GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -276,20 +276,20 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 
v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s1, s0, s1 -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -298,9 +298,9 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -312,19 +312,19 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -333,9 +333,9 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 @@ -1720,33 +1720,33 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB9_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 +; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB9_2: 
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -1787,28 +1787,28 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB9_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, 
v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -1820,20 +1820,20 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s1, s0, s1 -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1842,9 +1842,9 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -1856,19 +1856,19 @@ define amdgpu_kernel void @sub_i32_uniform(i32 
addrspace(1)* %out, i32 %subitive ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1877,9 +1877,9 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir index a4b71ed..f8f623c 100644 --- a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir +++ b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -1,7 +1,6 @@ # RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass 
post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s -# Make sure the default assumption is xnack enabled with no cpu -# RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+volcanic-islands -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s +# RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+volcanic-islands -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN %s # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck --check-prefix=GCN %s --- # Trivial clause at beginning of program diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index fd5cf7a..19b82bd 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index 5668994..20a43f0 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -1,7 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefixes=CHECK,GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefixes=CHECK,GCN %s ; RUN: FileCheck --enable-var-scope --check-prefixes=CHECK,DBG %s < %t ; REQUIRES: asserts +; FIXME: Verifier error with xnack enabled. 
+ ; CHECK-LABEL: {{^}}cluster_load_cluster_store: define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index e90c855..80dc38c 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICI,SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI,SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,SICIVI,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}load_i32: ; GCN-DAG: s_mov_b32 s3, 0 @@ -9,8 +9,8 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds i32, i32 addrspace(6)* %p1, i32 2 %r0 = load i32, i32 addrspace(6)* %p0 @@ -21,13 +21,18 @@ define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)* } ; GCN-LABEL: {{^}}load_v2i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 +; SICIVI-DAG: s_mov_b32 s3, 0 +; 
SICIVI-DAG: s_mov_b32 s2, s1 +; SICIVI-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; GFX9-DAG: s_mov_b32 s6, s1 +; GFX9-DAG: s_mov_b32 s7, 0 +; GFX9-DAG: s_mov_b32 s1, s7 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10 define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(6)* %p1, i32 2 %r0 = load <2 x i32>, <2 x i32> addrspace(6)* %p0 @@ -43,8 +48,10 @@ define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 define amdgpu_vs <4 x float> @load_v4i32(<4 x i32> addrspace(6)* inreg %p0, <4 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* %p1, i32 2 %r0 = load <4 x i32>, <4 x i32> addrspace(6)* %p0 @@ -60,8 +67,10 @@ define amdgpu_vs <4 x float> @load_v4i32(<4 x i32> addrspace(6)* inreg %p0, <4 x ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx8 
s[{{.*}}], s[2:3], 0x40 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 define amdgpu_vs <8 x float> @load_v8i32(<8 x i32> addrspace(6)* inreg %p0, <8 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(6)* %p1, i32 2 %r0 = load <8 x i32>, <8 x i32> addrspace(6)* %p0 @@ -77,8 +86,10 @@ define amdgpu_vs <8 x float> @load_v8i32(<8 x i32> addrspace(6)* inreg %p0, <8 x ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 define amdgpu_vs <16 x float> @load_v16i32(<16 x i32> addrspace(6)* inreg %p0, <16 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <16 x i32>, <16 x i32> addrspace(6)* %p1, i32 2 %r0 = load <16 x i32>, <16 x i32> addrspace(6)* %p0 @@ -94,8 +105,10 @@ define amdgpu_vs <16 x float> @load_v16i32(<16 x i32> addrspace(6)* inreg %p0, < ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 +; VI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 +; VI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, float addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds float, float addrspace(6)* %p1, i32 2 %r0 = load float, float addrspace(6)* %p0 @@ -105,13 +118,18 @@ define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, 
float addrspac } ; GCN-LABEL: {{^}}load_v2float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 +; SICIVI-DAG: s_mov_b32 s3, 0 +; SICIVI-DAG: s_mov_b32 s2, s1 +; SICIVI-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; GFX9-DAG: s_mov_b32 s6, s1 +; GFX9-DAG: s_mov_b32 s7, 0 +; GFX9-DAG: s_mov_b32 s1, s7 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10 define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(6)* %p1, i32 2 %r0 = load <2 x float>, <2 x float> addrspace(6)* %p0 @@ -126,8 +144,10 @@ define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 define amdgpu_vs <4 x float> @load_v4float(<4 x float> addrspace(6)* inreg %p0, <4 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <4 x float>, <4 x float> addrspace(6)* %p1, i32 2 %r0 = load <4 x float>, <4 x float> addrspace(6)* %p0 @@ -142,8 +162,10 @@ define amdgpu_vs <4 x float> @load_v4float(<4 x float> addrspace(6)* inreg %p0, ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], 
s[2:3], 0x10 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 define amdgpu_vs <8 x float> @load_v8float(<8 x float> addrspace(6)* inreg %p0, <8 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <8 x float>, <8 x float> addrspace(6)* %p1, i32 2 %r0 = load <8 x float>, <8 x float> addrspace(6)* %p0 @@ -158,8 +180,10 @@ define amdgpu_vs <8 x float> @load_v8float(<8 x float> addrspace(6)* inreg %p0, ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 define amdgpu_vs <16 x float> @load_v16float(<16 x float> addrspace(6)* inreg %p0, <16 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <16 x float>, <16 x float> addrspace(6)* %p1, i32 2 %r0 = load <16 x float>, <16 x float> addrspace(6)* %p0 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 78e5eb0..77cab29 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -20,16 +20,16 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 
+; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s0, s2 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: s_flbit_i32_b32 s5, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctlz_i32: @@ -376,23 +376,23 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s0, s2 -; SI-NEXT: s_flbit_i32_b32 s1, s3 -; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: s_or_b32 s2, s2, s3 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; SI-NEXT: s_flbit_i32_b32 s6, s4 +; SI-NEXT: s_flbit_i32_b32 s7, s5 +; SI-NEXT: s_add_i32 s6, s6, 32 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctlz_i64: @@ -440,22 +440,22 @@ define 
amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s0, s2 -; SI-NEXT: s_flbit_i32_b32 s1, s3 -; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: s_or_b32 s2, s2, s3 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; SI-NEXT: s_flbit_i32_b32 s6, s4 +; SI-NEXT: s_flbit_i32_b32 s7, s5 +; SI-NEXT: s_add_i32 s6, s6, 32 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctlz_i64_trunc: diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll index 015a6b8..c638beb 100644 --- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll +++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll @@ -17,8 +17,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 < %s | FileCheck --check-prefixes=GFX704 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck --check-prefixes=GFX704 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx705 < %s | FileCheck --check-prefixes=GFX705 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | 
FileCheck --check-prefixes=GFX801 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo < %s | FileCheck --check-prefixes=GFX801 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx801 < %s | FileCheck --check-prefixes=GFX801 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=carrizo < %s | FileCheck --check-prefixes=GFX801 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 < %s | FileCheck --check-prefixes=GFX802 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland < %s | FileCheck --check-prefixes=GFX802 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck --check-prefixes=GFX802 %s @@ -38,11 +38,14 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck --check-prefixes=XNACK-GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-xnack < %s | FileCheck --check-prefixes=NO-XNACK-GFX902 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sram-ecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX904 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sram-ecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sramecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX904 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sram-ecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX904 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sram-ecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX904 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s + +; FIXME: With the default attributes these 
directives are not accurate for +; xnack and sramecc. Subsequent Target-ID patches will address this. ; GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600" ; GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601" @@ -53,24 +56,24 @@ ; GFX703: .amdgcn_target "amdgcn-amd-amdhsa--gfx703" ; GFX704: .amdgcn_target "amdgcn-amd-amdhsa--gfx704" ; GFX705: .amdgcn_target "amdgcn-amd-amdhsa--gfx705" -; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801+xnack" +; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801" ; GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802" ; GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803" ; GFX805: .amdgcn_target "amdgcn-amd-amdhsa--gfx805" -; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack" +; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810" ; GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" -; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack" +; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902" ; GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904" ; GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906" ; XNACK-GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack" ; NO-XNACK-GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902" -; SRAM-ECC-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+sram-ecc" -; SRAM-ECC-GFX906: "amdgcn-amd-amdhsa--gfx906+sram-ecc" +; SRAM-ECC-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+sramecc" +; SRAM-ECC-GFX906: "amdgcn-amd-amdhsa--gfx906+sramecc" -; SRAM-ECC-XNACK-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack+sram-ecc" -; SRAM-ECC-XNACK-GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sram-ecc" +; SRAM-ECC-XNACK-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack+sramecc" +; SRAM-ECC-XNACK-GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sramecc" define amdgpu_kernel void @directive_amdgcn_target() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 7fdcdb1..db34168 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ 
b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -321,11 +321,11 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_read_b32 v1, v1 offset:32 ; GFX9-NEXT: ds_read_b32 v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -374,11 +374,11 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1) ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_read_b32 v1, v1 offset:32 ; GFX9-NEXT: ds_read_b32 v2, v2 offset:32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -567,10 +567,10 @@ define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float a ; GFX9-ALIGNED-LABEL: unaligned_read2_f32: ; GFX9-ALIGNED: ; %bb.0: ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 ; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:1 ; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:2 @@ -596,10 +596,10 @@ 
define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float a ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 @@ -661,10 +661,10 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out, ; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-ALIGNED: ; %bb.0: ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 offset:5 ; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:6 ; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:7 @@ -690,10 +690,10 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out, ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s0, v2, 5 +; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s4, v2, 5 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: 
v_add_f32_e32 v0, v0, v1 @@ -745,10 +745,10 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %ou ; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-ALIGNED: ; %bb.0: ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 ; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1 ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 @@ -764,10 +764,10 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %ou ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 @@ -1210,10 +1210,10 @@ define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, ; GFX9-LABEL: misaligned_read2_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -1241,10 +1241,10 @@ define amdgpu_kernel 
void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addr ; GFX9-LABEL: misaligned_read2_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -1288,16 +1288,16 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; ; GFX9-LABEL: ds_read_diff_base_interleaving: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, s0, v1 -; GFX9-NEXT: v_add_u32_e32 v3, s1, v0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v1 -; GFX9-NEXT: v_add_u32_e32 v6, s3, v0 +; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 +; GFX9-NEXT: v_add_u32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v4, s6, v1 +; GFX9-NEXT: v_add_u32_e32 v6, s7, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 ; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: global_store_dword v8, v0, s[4:5] offset:40 +; GFX9-NEXT: global_store_dword v8, v0, s[2:3] offset:40 ; GFX9-NEXT: s_endpgm float addrspace(1)* nocapture %arg, [4 x [4 x float]] addrspace(3)* %arg1, @@ -1388,17 +1388,18 @@ define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspa ; 
GFX9-NEXT: s_getpc_b64 s[36:37] ; GFX9-NEXT: s_mov_b32 s36, s0 ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12 +; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s2 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-NEXT: ds_read_b32 v42, v41 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 6689f9e..8d5722b 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -200,14 +200,14 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa ; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] glc +; GFX9-NEXT: global_load_dwordx2 v[1:2], v5, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8 glc +; GFX9-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1] offset:8 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8 +; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8 ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i @@ -528,17 +528,17 @@ define 
amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* ; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: ds_write_b32 v2, v1 offset:32 +; GFX9-NEXT: ds_write_b32 v0, v1 offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ds_write_b32 v3, v0 offset:32 +; GFX9-NEXT: ds_write_b32 v3, v2 offset:32 ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i @@ -617,11 +617,11 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace ; GFX9-LABEL: misaligned_simple_write2_one_val_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-NEXT: v_add_u32_e32 v2, s0, v2 +; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15 @@ -674,11 +674,11 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double add ; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-ALIGNED: ; %bb.0: ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x34 +; 
GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s0, v2 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7 @@ -701,11 +701,11 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double add ; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2 ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2 ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -976,10 +976,10 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-ALIGNED: ; %bb.0: ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 -; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1 @@ -992,10 +992,10 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs ; 
GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4 -; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll index 3750e6b..85b068b 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -61,6 +61,9 @@ ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1032 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1032 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1033 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1033 %s +; FIXME: With the default attributes the eflags are not accurate for +; xnack and sramecc. Subsequent Target-ID patches will address this. 
+ ; ARCH-R600: Format: elf32-amdgpu ; ARCH-R600: Arch: r600 ; ARCH-R600: AddressSize: 32bit @@ -96,19 +99,15 @@ ; GFX704: EF_AMDGPU_MACH_AMDGCN_GFX704 (0x26) ; GFX705: EF_AMDGPU_MACH_AMDGCN_GFX705 (0x3B) ; GFX801: EF_AMDGPU_MACH_AMDGCN_GFX801 (0x28) -; GFX801-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX802: EF_AMDGPU_MACH_AMDGCN_GFX802 (0x29) ; GFX803: EF_AMDGPU_MACH_AMDGCN_GFX803 (0x2A) ; GFX805: EF_AMDGPU_MACH_AMDGCN_GFX805 (0x3C) ; GFX810: EF_AMDGPU_MACH_AMDGCN_GFX810 (0x2B) -; GFX810-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX900: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) ; GFX902: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; GFX902-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX904: EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E) ; GFX906: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) ; GFX908: EF_AMDGPU_MACH_AMDGCN_GFX908 (0x30) -; GFX908-NEXT: EF_AMDGPU_SRAM_ECC (0x200) ; GFX909: EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31) ; GFX90C: EF_AMDGPU_MACH_AMDGCN_GFX90C (0x32) ; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll index 46f104bd..b977bb0 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll @@ -1,24 +1,9 @@ -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=-sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=+sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX902 %s - ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc < %s | llvm-readobj --file-headers - | FileCheck 
--check-prefixes=NO-SRAM-ECC-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc,+xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s - -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-GFX908 %s - -; NO-SRAM-ECC-GFX902: Flags [ -; NO-SRAM-ECC-GFX902-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; NO-SRAM-ECC-GFX902-NEXT: EF_AMDGPU_XNACK (0x100) -; NO-SRAM-ECC-GFX902-NEXT: ] +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sramecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sramecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sramecc,+xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s -; SRAM-ECC-GFX902: Flags [ -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_SRAM_ECC (0x200) -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_XNACK (0x100) -; SRAM-ECC-GFX902-NEXT: ] +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 -mattr=+sramecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-GFX908 %s ; NO-SRAM-ECC-GFX906: Flags [ ; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 00df478..cd89c5b 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -33,9 +33,9 @@ define i32 @global_load_2xi16_align2(i16 addrspace(1)* %p) #0 { 
; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll index fc250d3..ad86771 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -32,18 +32,18 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen offset:2 +; GFX9-NEXT: buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2: ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_ushort v1, v0, off -; GFX9-FLASTSCR-NEXT: scratch_load_ushort v0, v0, off offset:2 +; GFX9-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off offset:2 ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll 
b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll index 979bfd6..268e327 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -1,11 +1,11 @@ ; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo --amdhsa-code-object-version=2 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 4244d8f4..aa659e8 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -228,8 +228,8 @@ define amdgpu_kernel 
void @store_load_sindex_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] ; GFX9-PAL-NEXT: s_mov_b32 s4, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 @@ -825,8 +825,8 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] ; GFX9-PAL-NEXT: s_mov_b32 s4, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 @@ -1456,8 +1456,8 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] ; GFX9-PAL-NEXT: s_mov_b32 s4, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 @@ -2013,8 +2013,8 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] ; GFX9-PAL-NEXT: s_mov_b32 s4, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index a27fa7d..b2cc852 100644 --- 
a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -7,26 +7,26 @@ define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* ; SI-LABEL: frem_f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -42,7 +42,7 @@ define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_f16: @@ -126,22 +126,22 @@ define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace ; SI-LABEL: fast_frem_f16: ; 
SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_rcp_f32_e32 v2, v1 @@ -149,7 +149,7 @@ define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f16: @@ -216,22 +216,22 @@ define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspa ; SI-LABEL: unsafe_frem_f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 
; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_rcp_f32_e32 v2, v1 @@ -239,7 +239,7 @@ define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspa ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f16: @@ -306,23 +306,23 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1) ; SI-LABEL: frem_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt 
vmcnt(0) ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -336,7 +336,7 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1) ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_f32: @@ -421,26 +421,26 @@ define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspa ; SI-LABEL: fast_frem_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f32: @@ -503,26 
+503,26 @@ define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrs ; SI-LABEL: unsafe_frem_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f32: @@ -585,20 +585,20 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace( ; SI-LABEL: frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s8 ; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 
-; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -712,20 +712,20 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs ; SI-LABEL: fast_frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -738,7 +738,7 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs ; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 ; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s14 +; SI-NEXT: s_mov_b32 s0, s10 ; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], 
v8 ; SI-NEXT: v_not_b32_e32 v6, v6 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 @@ -752,7 +752,7 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f64: @@ -824,20 +824,20 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add ; SI-LABEL: unsafe_frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -850,7 +850,7 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add ; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 ; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s14 +; 
SI-NEXT: s_mov_b32 s0, s10 ; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 ; SI-NEXT: v_not_b32_e32 v6, v6 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 @@ -864,7 +864,7 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f64: @@ -1816,20 +1816,20 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; SI-LABEL: frem_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s8 ; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 offset:64 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] ; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 23944bb..f290c1f 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -47,15 +47,15 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 % ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s4, 1 +; GFX9-NEXT: s_not_b32 s1, s6 +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -106,11 +106,11 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) { ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 25 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 25 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -185,21 +185,21 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_not_b32 s1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; GFX9-NEXT: s_not_b32 s1, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s5, s5, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 +; GFX9-NEXT: 
v_alignbit_b32 v0, s5, v0, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_not_b32 s1, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v3 +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -261,11 +261,11 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -365,37 +365,37 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s0, s7, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_not_b32 s3, s3 +; GFX9-NEXT: s_not_b32 s1, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s7, s7, 1 -; 
GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 +; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_not_b32 s1, s14 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; GFX9-NEXT: s_lshr_b32 s3, s6, 1 -; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: s_not_b32 s1, s13 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s5, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_not_b32 s1, s12 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32: @@ -470,20 +470,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> ; ; GFX9-LABEL: fshl_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; GFX9-NEXT: 
v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 23 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32_imm: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index a2a9cb5..77d1016 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -50,11 +50,11 @@ define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 % ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm @@ -103,11 +103,11 @@ define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) { ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 7 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 7 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -170,14 +170,14 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -236,11 +236,11 @@ define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -316,25 +316,25 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, ; ; GFX9-LABEL: fshr_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 
v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v4i32: @@ -401,20 +401,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> ; ; GFX9-LABEL: fshr_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v4i32_imm: diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index 04dc4a2..41c3f026 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -389,6 +389,7 @@ define <4 x half> @v4f16_func_void() #0 { ; FIXME: Should not scalarize ; GCN-LABEL: {{^}}v5i16_func_void: ; 
GFX9: buffer_load_dwordx2 v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index b6d1e16..d80fbaa 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -2478,13 +2478,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v4, s[4:5] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[4:5] offset:16 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 @@ -2510,7 +2510,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -2518,8 +2518,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v4, s[4:5] offset:16 +; GFX10-NEXT: global_load_dwordx4 
v[0:3], v8, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[4:5] offset:16 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 @@ -2622,15 +2622,15 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v12, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v12, s[4:5] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v12, s[4:5] offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v12, s[4:5] offset:48 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 @@ -2656,7 +2656,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -2664,10 +2664,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v12, s[4:5] -; GFX10-NEXT: 
global_load_dwordx4 v[4:7], v12, s[4:5] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v12, s[4:5] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v12, s[4:5] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 @@ -2697,19 +2697,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v28, 0 +; GFX9-NEXT: v_mov_b32_e32 v32, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 -; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 -; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 -; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GFX9-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] 
offset:96 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 @@ -2735,7 +2735,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v28, 0 +; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -2743,14 +2743,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 @@ 
-2780,28 +2780,28 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v28, 0 +; GFX9-NEXT: v_mov_b32_e32 v32, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 -; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 -; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 -; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GFX9-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: global_load_dword v33, v[0:1], off ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: global_load_dword v32, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, off, 
s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2822,29 +2822,29 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v28, 0 +; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: global_load_dword v32, v[0:1], off +; GFX10-NEXT: global_load_dword v33, v[0:1], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, 
external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 ; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -2948,13 +2948,13 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:4 -; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:4 +; GFX9-NEXT: global_load_ubyte v0, v2, s[4:5] ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 @@ -2980,7 +2980,7 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -2988,8 +2988,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ubyte v0, v1, s[4:5] -; GFX10-NEXT: global_load_dword v1, v1, s[4:5] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v2, s[4:5] +; GFX10-NEXT: 
global_load_dword v1, v2, s[4:5] offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 @@ -5650,7 +5650,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 18 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s36, 0 @@ -5671,8 +5671,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s51, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 16 ; GFX10-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-NEXT: s_getpc_b64 s[30:31] @@ -5831,13 +5831,17 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 18 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s22, s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s36, 0 ; GFX10-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-NEXT: v_writelane_b32 v40, s38, 2 ; GFX10-NEXT: v_writelane_b32 v40, s39, 3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s22 ; GFX10-NEXT: v_writelane_b32 v40, s40, 4 ; GFX10-NEXT: v_writelane_b32 v40, s41, 5 ; GFX10-NEXT: v_writelane_b32 v40, s42, 6 @@ 
-5850,24 +5854,21 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s49, 13 ; GFX10-NEXT: v_writelane_b32 v40, s50, 14 ; GFX10-NEXT: v_writelane_b32 v40, s51, 15 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s20, s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX10-NEXT: v_writelane_b32 v40, s30, 16 ; GFX10-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-NEXT: s_getpc_b64 s[30:31] ; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-NEXT: s_mov_b32 s20, s36 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 -; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: s_mov_b32 s21, s37 ; GFX10-NEXT: s_mov_b32 s22, s38 ; GFX10-NEXT: s_mov_b32 s23, s39 diff --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir index 9b32e53..ef01220 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir @@ -1,6 +1,6 @@ -# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 
-mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s +# RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s # GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr_hidden_bundle # GCN: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir index d0879cd..2ad7155 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir @@ -1,6 +1,6 @@ -# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s # GCN-LABEL: 
name: break_smem_clause_max_look_ahead_in_bundle # GCN: S_LOAD_DWORDX2_IMM diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll index 4cfa67f..396b8c7 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -1,7 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX700,WAVE64 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX803,WAVE64 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX900,WAVE64 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX1010,WAVE32 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX700,WAVE64 %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX803,WAVE64 %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX900,WAVE64 %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < 
%s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX1010,WAVE32 %s @var = addrspace(1) global float 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index a7bfb6f..03330e7 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX700 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX803 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=2 -mattr=-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -mattr=-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX900 %s @var = addrspace(1) global float 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 8b9931a..a1a0123 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-LABEL: udiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_sub_i32 s3, 0, s2 @@ -64,8 +64,8 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-LABEL: urem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_sub_i32 s3, 0, s2 @@ -121,6 +121,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s3, 31 @@ -180,6 +181,7 @@ define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s3, s2, 31 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index 1f21e3e..dd813b0 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -62,56 +62,56 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: 
s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -194,69 +194,69 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot2_MulMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; 
GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v1, s2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: v_add_u32_e32 v1, s5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v1, s5, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, s9, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MulMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, s2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: v_add_u32_e32 v1, s5, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; 
GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, s5, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, s9, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MulMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 -; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s5, s6 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s4, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s4, s5 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s8, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -332,55 +332,55 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* 
%src1, ; GFX9-NODL-LABEL: idot2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; 
GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot2_i32_i16 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot2_i32_i16 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_i32_i16 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -457,66 +457,66 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: idot2_MixedTypedMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: 
s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedTypedMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedTypedMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -596,56 
+596,56 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot2_alt_AddOperands: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_alt_AddOperands: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -722,66 +722,66 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: idot2_MixedExt: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s6, s3, 0xffff -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedExt: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: 
s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s6, s3, 0xffff -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedExt: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, 
s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -850,55 +850,55 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: notudot2_SameVec: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, s2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, s4, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, s1, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_SameVec: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; 
GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, s2, v1 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, s4, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, s1, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, s0, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_SameVec: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s3 -; GFX10-DL-NEXT: s_and_b32 s2, s4, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s1, s8 +; GFX10-DL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -978,56 +978,56 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: 
udot2_v4i16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, 
s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1107,56 +1107,56 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot2_v4i16_Hi: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; 
GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x4 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x4 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16_Hi: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: 
v_dot2_u32_u16 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1236,69 +1236,69 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: notudot2_v4i16_Even: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NODL-NEXT: s_and_b32 s2, s2, s8 -; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, s10 +; GFX9-NODL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-NODL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Even: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-DL-NEXT: s_and_b32 s2, s2, s8 -; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-DL-NEXT: s_and_b32 s0, s0, s10 +; GFX9-DL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-DL-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-DL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: s_and_b32 s4, s8, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Even: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s10, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s1, s1, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 -; GFX10-DL-NEXT: s_and_b32 s0, s0, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 -; GFX10-DL-NEXT: s_and_b32 s1, s2, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s10 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s5, s9, s4 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s1, v0 +; GFX10-DL-NEXT: s_and_b32 s1, s8, s4 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1378,69 +1378,69 @@ 
define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: notudot2_v4i16_Middle: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Middle: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Middle: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s10, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s1, s1, s7 -; 
GFX10-DL-NEXT: s_and_b32 s3, s3, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s10 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s9, s4 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 -; GFX10-DL-NEXT: s_lshr_b32 s1, s2, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s1, v0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s8, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1520,69 +1520,69 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: notudot2_DiffIndex: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 16 +; 
GFX9-NODL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_DiffIndex: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 -; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_DiffIndex: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-DL-NEXT: s_and_b32 s6, s1, s2 -; GFX10-DL-NEXT: s_and_b32 s0, s0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX10-DL-NEXT: s_and_b32 s6, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s4 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1664,72 +1664,72 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-LABEL: udot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, 
s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v2, v1 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v2, v1 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: 
v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v2, v1 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v2, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 ; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 
v0, v1, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1810,69 +1810,69 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-LABEL: idot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v1 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v2, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; 
GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1956,72 +1956,72 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; 
GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; 
GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2103,69 +2103,69 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-LABEL: idot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; 
%entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v3, v1 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v3, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, 
s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v3, v1 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v3, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i16 s3, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 
v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i16 s5, s1 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 16 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2250,72 +2250,72 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-LABEL: udot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; 
GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v1, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-DL-NEXT: 
global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 ; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2397,69 +2397,69 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-LABEL: idot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; 
GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 -; GFX10-DL-NEXT: 
v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2541,40 +2541,40 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot2_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s8, 16 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; 
GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_acc16: @@ -2584,10 +2584,10 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot2_u32_u16 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_dot2_u32_u16 v1, s6, s7, v1 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2673,12 +2673,12 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: notsdot2_sext8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 @@ -2688,20 +2688,20 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v4, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notsdot2_sext8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 @@ -2711,9 +2711,9 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v4, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; 
GFX10-DL-LABEL: notsdot2_sext8: diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index 15e7147..e8bdaff 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -72,63 +72,63 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: idot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v2, v1 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: 
global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s0, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 
s8 ; GFX10-DL-NEXT: v_dot4_i32_i8 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -246,47 +246,47 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: idot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v2, v1 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v3, v1 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; 
GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16: @@ -296,10 +296,10 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_i32_i8 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_dot4_i32_i8 v1, s6, s7, v1 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -409,48 +409,48 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* 
%src1, ; GFX9-NODL-LABEL: idot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s4, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s8, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s8, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s8, s8, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v3, v1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, 
s3, v2, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc8: @@ -460,10 +460,10 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s6, s7, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -565,91 +565,91 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: 
idot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v1, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v2, v1 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_multiuse_mul1: 
; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-DL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v2, v1 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_sext_i32_i8 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i8 s3, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -760,92 +760,92 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: idot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; 
GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s6, s3, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s4, s2, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s6, s1, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s4, s0, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s2, v3, v4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s0, v3, v4 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v1, v2, v3 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-DL-NEXT: s_ashr_i32 s6, s3, 24 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-DL-NEXT: s_ashr_i32 s4, s2, 24 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-DL-NEXT: s_ashr_i32 s6, s1, 24 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s1, s1 +; GFX9-DL-NEXT: s_ashr_i32 s4, s0, 24 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s2, v3, v4 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v3, v4 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s5, s3 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -939,15 +939,15 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: idot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, 0xffff ; 
GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v4, 8, s5 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s4 @@ -956,37 +956,37 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, s4, v5 ; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s2 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s0 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX9-NODL-NEXT: v_and_b32_e32 v4, s3, v5 -; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, s1, v5 +; GFX9-NODL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GFX9-NODL-NEXT: v_and_b32_e32 v4, s2, v5 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v5 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v2 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s5 ; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s4 @@ -995,23 +995,23 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 ; GFX9-DL-NEXT: v_and_b32_e32 v6, s4, v5 ; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s3 -; GFX9-DL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s2 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s1 +; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s0 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX9-DL-NEXT: v_and_b32_e32 v4, s3, v5 -; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s1, v5 +; GFX9-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v5 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: global_load_ushort v2, v0, s[0:1] +; 
GFX9-DL-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: @@ -1022,28 +1022,28 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s0 -; GFX10-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s1 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s3, v2 -; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s2 +; GFX10-DL-NEXT: s_bfe_i32 s1, s6, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s2, s7, 0x80000 +; GFX10-DL-NEXT: v_and_b32_e32 v6, s1, v2 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s7 +; GFX10-DL-NEXT: v_and_b32_e32 v5, s2, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 -; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x80000 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s0 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x80000 -; GFX10-DL-NEXT: 
v_ashrrev_i16_e64 v5, 8, s0 +; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1 ; GFX10-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v6 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 4159349..48f2f5c 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -74,64 +74,64 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NODL-NEXT: s_bfe_u32 s9, 
s4, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s0, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: 
; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -241,48 +241,48 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_and_b32 s4, s1, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s8, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s8, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s8, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s8, s8, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v3, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v4, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; 
GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16: @@ -292,10 +292,10 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s6, s7, v1 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -406,48 +406,48 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s4, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_bfe_u32 
s8, s4, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s8, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s8, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s8, s8, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v3, v1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8: @@ -457,10 +457,10 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s6, s7, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -548,47 +548,47 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot2_8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s4, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s3, s3, 0x80008 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, 
s2, v2, v1 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_bfe_u32 s5, s8, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s4, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s3, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x80008 -; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x80008 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-DL-NEXT: s_and_b32 s4, s8, s0 +; GFX9-DL-NEXT: s_and_b32 s0, s1, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: s_bfe_u32 s5, s8, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_8: @@ -598,17 +598,17 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s1, 0xff +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s0, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s2, s1 -; GFX10-DL-NEXT: s_and_b32 s1, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s1, s7, s0 +; GFX10-DL-NEXT: s_and_b32 s0, s6, s0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x80008 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s7, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -700,48 +700,48 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; 
GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_and_b32 s4, s1, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s0, s8, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s8, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s8, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NODL-NEXT: s_lshr_b32 s8, s8, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v3, v1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; 
GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s3, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s1, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationInsideMAD: @@ -751,10 +751,10 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s1, s0, v1 +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s7, s6, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -858,63 +858,63 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* % ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[2:3] +; 
GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s4, s1, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s8, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s0, s8, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s8, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NODL-NEXT: s_lshr_b32 s8, s8, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v3, v1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; 
GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-DL-NEXT: s_and_b32 s4, s1, s0 +; GFX9-DL-NEXT: s_bfe_u32 s6, s8, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_and_b32 s0, s8, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: s_bfe_u32 s9, s8, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-DL-NEXT: s_lshr_b32 s8, s8, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v3, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v3, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: @@ -922,24 +922,24 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* % ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_movk_i32 s6, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 -; GFX10-DL-NEXT: s_and_b32 s2, s0, s6 -; GFX10-DL-NEXT: s_and_b32 s3, s1, s6 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 +; GFX10-DL-NEXT: s_and_b32 s0, s6, s2 +; GFX10-DL-NEXT: s_and_b32 s1, s7, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm @@ -1045,94 +1045,94 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 
0xff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; 
GFX10-DL-LABEL: udot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s4, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1247,97 +1247,97 @@ define amdgpu_kernel void 
@udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot4_multiuse_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v1, v2 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, s10, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v3, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v3, v1 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v3, v1 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: 
v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v3, v1 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v1, v2 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-DL-NEXT: v_add_u32_e32 v2, s10, v1 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v3, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v3, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v3, v1 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; 
GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v3, v1 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v3, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_movk_i32 s7, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s6, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 -; GFX10-DL-NEXT: s_and_b32 s2, s0, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s1, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v0 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: s_and_b32 s4, s0, s6 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: 
v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s5, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s8, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1449,61 +1449,61 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: notdot4_mixedtypes: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s0, 
s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v3, v1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x80008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, 
v3, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v4, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes: @@ -1513,21 +1513,21 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_sext_i32_i8 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i8 s3, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_sext_i32_i8 s0, s6 +; GFX10-DL-NEXT: s_sext_i32_i8 s1, s7 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm @@ -1641,94 +1641,94 @@ define 
amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 24 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s4, 24 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s4 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, s3, v3, v4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, s0, v3, v4 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v2, v3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; 
GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 24 -; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 24 -; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX9-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s4 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-DL-NEXT: s_and_b32 s0, s0, s8 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s3, v3, v4 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, s0, v3, v4 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, v3 -; GFX9-DL-NEXT: 
v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s6, 0xff -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: s_and_b32 s4, s2, s6 -; GFX10-DL-NEXT: s_and_b32 s6, s3, s6 -; GFX10-DL-NEXT: v_and_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_and_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-DL-NEXT: s_and_b32 s6, 
s0, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s5 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s6, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1823,71 +1823,71 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-NODL-NEXT: v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 24 +; 
GFX9-NODL-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s6, 16, v4 ; GFX9-NODL-NEXT: v_lshl_or_b32 v5, s4, 16, v5 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX9-NODL-NEXT: v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v5, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v2 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-DL-NEXT: v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 24 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-DL-NEXT: v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, s6, 16, v4 ; GFX9-DL-NEXT: v_lshl_or_b32 v5, s4, 16, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX9-DL-NEXT: v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-DL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_and_b32_sdwa v5, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-DL-NEXT: v_and_b32_sdwa v3, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, 
v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: @@ -1898,24 +1898,24 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s6 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s7 +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s0, s7, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s6, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s6, 24 +; GFX10-DL-NEXT: s_lshr_b32 s0, s7, 24 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, s1, 16, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, s0, 16, v2 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, s0, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, s1, 16, v2 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v3, v1 @@ -2039,23 +2039,23 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-LABEL: udot4_acc8_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: global_load_ubyte v4, v0, s[0:1] +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v4, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s2, v1 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s0, v1 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v2, s0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s4, v3 @@ -2068,29 +2068,29 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ubyte v4, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v4, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s2, v1 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 
v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s0, v1 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, s0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s4, v3 @@ -2103,7 +2103,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: @@ -2113,18 +2113,18 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s1 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 24 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s3 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s6 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s7 +; 
GFX10-DL-NEXT: s_lshr_b32 s0, s6, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 24 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v2, v3 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s0, s1 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s6, s7 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 16 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v4 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index f4609ca..949ade9 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -3,10 +3,10 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | 
FileCheck -check-prefixes=GFX10-DL-NOXNACK %s define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc32: @@ -118,77 +118,120 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-LABEL: idot8_acc32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mad_i32_i24 v1, s4, v1, v2 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-NEXT: v_mad_i32_i24 v1, s8, v2, v1 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: 
s_bfe_i32 s10, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v2, s11 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_mad_i32_i24 v1, s10, v2, v1 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v1, s12, v2, v1 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v1, s14, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mad_i32_i24 v1, s16, v2, v1 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: 
s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s0, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc32: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-XNACK-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc32: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, 
SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-NOXNACK-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 @@ -410,47 +453,47 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40000 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s11, s3, 
0x40008 +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: s_lshr_b32 s4, s2, 12 -; GFX9-NEXT: s_lshr_b32 s5, s3, 12 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-NEXT: s_lshr_b32 s4, s0, 12 +; GFX9-NEXT: s_lshr_b32 s5, s1, 12 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mov_b32_e32 v6, s9 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s5 ; GFX9-NEXT: v_mul_i32_i24_e32 v2, s10, v2 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v8, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_i32_i24 v1, s6, v5, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s8, v6, v1 @@ -459,54 +502,54 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mad_i32_i24 v1, s12, v7, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s14, v8, v1 ; 
GFX9-NEXT: v_mad_i32_i24 v1, s16, v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x40008 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 12 -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 12 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 +; 
GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s9 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, s10, v2 -; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v5, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v6, v1 @@ -515,11 +558,114 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s12, v7, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s14, v8, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s16, v9, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc16: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: 
s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX10-DL-XNACK-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s0, s6, 12 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s1, s7, 12 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s2, s6, 0x40000 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s3, s7, 0x40000 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s0 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s8, s6, 0x40004 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s9, s6, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s10, s7, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s7, 0x40004 +; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e64 v4, s9, s10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40010 +; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s8, s0, v1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s0, 0xffff +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40010 +; GFX10-DL-XNACK-NEXT: v_mad_u32_u24 v1, v2, v3, v1 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40014 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40014 +; GFX10-DL-XNACK-NEXT: 
v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40018 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40018 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s0, s6, 28 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s1, s7, 28 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc16: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s1, 12 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s2 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s3 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e64 v4, s9, s10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40010 +; 
GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s8, s2, v1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: v_mad_u32_u24 v1, v2, v3, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-NOXNACK-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 @@ -773,122 +919,225 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_movk_i32 s2, 0xff +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s3, 12 -; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40000 -; GFX9-NEXT: s_lshr_b32 s6, s4, 12 -; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40004 -; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40008 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s5 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s6 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s11, s3, 0x40008 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s8, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-NEXT: s_bfe_i32 s7, s8, 0x40000 +; GFX9-NEXT: s_lshr_b32 s5, s8, 12 +; GFX9-NEXT: s_bfe_i32 s10, s8, 0x40004 +; GFX9-NEXT: s_bfe_i32 s12, s8, 0x40008 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s5 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mov_b32_e32 v6, s10 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_mul_i32_i24_e32 v2, s11, v2 -; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s14, s8, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-NEXT: s_bfe_i32 s16, s8, 0x40014 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-NEXT: s_bfe_i32 s18, s8, 0x40018 +; GFX9-NEXT: 
s_bfe_i32 s15, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 -; GFX9-NEXT: s_ashr_i32 s4, s4, 28 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-NEXT: s_ashr_i32 s8, s8, 28 ; GFX9-NEXT: v_mov_b32_e32 v9, s18 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v1, s7, v5, v1 +; GFX9-NEXT: v_mad_i32_i24 v1, s6, v5, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s9, v6, v1 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s13, v7, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s15, v8, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s17, v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: 
s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 12 -; GFX9-DL-NEXT: s_bfe_i32 s8, s4, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 12 -; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s5 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s6 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s7, s8, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s5, s8, 12 +; GFX9-DL-NEXT: s_bfe_i32 s10, s8, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s12, s8, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s10 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, s11, v2 -; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-DL-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s14, s8, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-DL-NEXT: s_bfe_i32 s16, s8, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-DL-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s18, s8, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 -; 
GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 +; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s8, s8, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s18 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s7, v5, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v5, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s9, v6, v1 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s13, v7, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s15, v8, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s17, v9, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc8: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: global_load_ubyte v1, v0, s[4:5] +; GFX10-DL-XNACK-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s0, s6, 12 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s1, s7, 12 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s2, s6, 0x40000 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s3, s7, 0x40000 +; 
GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s0 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s8, s6, 0x40004 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s9, s6, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s10, s7, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s7, 0x40004 +; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e64 v4, s9, s10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40010 +; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s8, s0, v1 +; GFX10-DL-XNACK-NEXT: s_movk_i32 s0, 0xff +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40010 +; GFX10-DL-XNACK-NEXT: v_mad_u32_u24 v1, v2, v3, v1 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40014 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40014 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40018 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40018 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s0, s6, 28 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s1, s7, 28 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc8: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; 
GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s1, 12 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s2 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s3 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e64 v4, s9, s10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s8, s2, v1 +; GFX10-DL-NOXNACK-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: v_mad_u32_u24 v1, v2, v3, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 
0x40018 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-NOXNACK-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 @@ -1124,111 +1373,204 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-LABEL: idot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v2 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-NEXT: 
v_mad_i32_i24 v1, s4, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mad_i32_i24 v1, s6, v3, v1 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-NEXT: v_mad_i32_i24 v1, s8, v3, v1 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_mad_i32_i24 v1, s10, v3, v1 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v1, s12, v3, v1 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v1, s14, v3, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mad_i32_i24 v1, s16, v3, v1 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_mad_i32_i24 v1, s2, v3, v1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mad_i32_i24 v1, s0, v3, v1 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 ; 
GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[2:3], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v2 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v3, v1 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v3, v1 -; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s10, v3, v1 -; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-DL-NEXT: s_bfe_i32 s15, 
s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s12, v3, v1 -; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s14, v3, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s16, v3, v1 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v3, v1 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s0, v3, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, 
s1, 0x40000 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v0 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40004 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40004 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40008 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x4000c +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x4000c +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40010 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40010 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40014 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40014 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40018 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40018 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-DL-NOXNACK-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40000 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x4000c +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x4000c +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 @@ -1456,38 +1798,38 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-LABEL: idot8_acc32_vecMul: ; GFX9: ; %bb.0: ; 
%entry ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 28 -; GFX9-NEXT: s_ashr_i32 s11, s3, 28 -; GFX9-NEXT: s_bfe_i32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x4000c -; GFX9-NEXT: s_bfe_i32 s16, s3, 0x40008 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s3, s3, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s2, s2, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s4, s0, 28 +; GFX9-NEXT: s_ashr_i32 s11, s1, 28 +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x4000c +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40008 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s1, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40010 +; 
GFX9-NEXT: s_bfe_i32 s8, s0, 0x4000c +; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mad_i32_i24 v1, s2, v1, v2 +; GFX9-NEXT: v_mad_i32_i24 v1, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: v_mad_i32_i24 v1, s10, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 @@ -1502,31 +1844,74 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mad_i32_i24 v1, s5, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_i32_i24 v1, s4, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 
v2, s3 -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s0, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-XNACK-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-DL-NOXNACK-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-NOXNACK-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 @@ -1705,33 +2090,35 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: idot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_and_b32 s11, s2, 15 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s8 -; GFX9-NEXT: 
v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_and_b32 s11, s0, 15 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s9, s10 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s5, s8 +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s4 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 @@ -1740,25 +2127,24 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_and_b32 s17, s6, 15 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s17, s6 -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s17, s6 +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v5 -; GFX9-NEXT: global_load_ushort v5, v0, s[0:1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s15, s16 -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s13, s14 +; GFX9-NEXT: global_load_ushort v5, v0, s[2:3] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s15, s16 +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s13, s14 ; GFX9-NEXT: 
v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s12 ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] @@ -1772,38 +2158,40 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_load_dword s6, 
s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s8 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s11, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s10 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s8 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s1, s4 ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 @@ -1812,25 +2200,24 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s17, s6 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s17, s6 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 -; GFX9-DL-NEXT: global_load_ushort v5, v0, s[0:1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s15, s16 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s13, s14 +; GFX9-DL-NEXT: global_load_ushort v5, v0, s[2:3] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s15, s16 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s13, s14 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s12 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s12 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] @@ -1844,9 +2231,152 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short 
v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX10-DL-XNACK-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s1, s6, 28 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s2, s6, 0x40010 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s3, s6, 0x40014 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s8, s6, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s9, s6, 0x4000c +; GFX10-DL-XNACK-NEXT: s_and_b32 s10, s6, 15 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s6, s6, 0x40004 +; GFX10-DL-XNACK-NEXT: s_and_b32 s11, s7, 15 +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s6, s10, s6 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s10, s7, 0x40004 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, s6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s6, s11, s10 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s11, s7, 0x4000c +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v3, 12, s6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s6, s7, 0x40008 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1] +; 
GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s8, s7, 0x40010 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s6, s7, 0x40014 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s3, s8, s6 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v6, 12, s3 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s10, s7, 0x40018 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s2, s7, 28 +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s1, s10, s2 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v4 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; 
GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s7, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX10-DL-NOXNACK-NEXT: s_and_b32 s10, s0, 15 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_and_b32 s11, s1, 15 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s10, s0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s10, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s11, s10 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s11, s1, 0x4000c +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s0, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: 
v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s8, s1, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s0, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s8, s0 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s0, s1, 28 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s1, s2, s3 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s10, s0 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v4 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 @@ -2118,31 +2648,31 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s9, s3, 4 -; GFX9-NEXT: s_lshr_b32 s16, s4, 4 -; GFX9-NEXT: v_lshlrev_b16_e64 v2, 12, s3 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s8, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s9, s1, 4 +; GFX9-NEXT: s_lshr_b32 s16, s8, 4 +; GFX9-NEXT: v_lshlrev_b16_e64 v2, 12, s1 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s8 ; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s9 ; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s16 -; GFX9-NEXT: s_lshr_b32 s10, s3, 12 -; GFX9-NEXT: s_lshr_b32 s11, s3, 8 -; GFX9-NEXT: s_lshr_b32 s17, s4, 12 -; GFX9-NEXT: s_lshr_b32 s18, s4, 8 +; GFX9-NEXT: s_lshr_b32 s10, s1, 12 +; GFX9-NEXT: s_lshr_b32 s11, s1, 8 +; GFX9-NEXT: s_lshr_b32 s17, s8, 12 +; GFX9-NEXT: s_lshr_b32 s18, s8, 8 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s11 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s10 ; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s18 @@ -2158,24 +2688,24 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mul_lo_u16_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s5, s3, 20 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 20 -; GFX9-NEXT: s_lshr_b32 s13, s4, 16 +; GFX9-NEXT: s_lshr_b32 s4, s1, 20 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_lshr_b32 s12, s8, 20 +; GFX9-NEXT: s_lshr_b32 s13, s8, 16 ; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v4, v4, v11 -; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s5 +; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4 ; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s13 ; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s12 -; GFX9-NEXT: s_lshr_b32 s7, s3, 28 -; GFX9-NEXT: s_lshr_b32 s8, s3, 24 -; GFX9-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-NEXT: s_lshr_b32 s15, s4, 24 -; GFX9-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-NEXT: s_lshr_b32 s6, s1, 28 +; GFX9-NEXT: s_lshr_b32 s7, s1, 24 +; 
GFX9-NEXT: s_lshr_b32 s14, s8, 28 +; GFX9-NEXT: s_lshr_b32 s15, s8, 24 +; GFX9-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6 ; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s15 ; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s14 ; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 @@ -2194,7 +2724,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-NEXT: v_or_b32_e32 v5, v3, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 @@ -2206,36 +2736,36 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; 
GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s9, s3, 4 -; GFX9-DL-NEXT: s_lshr_b32 s16, s4, 4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s3 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 4 +; GFX9-DL-NEXT: s_lshr_b32 s16, s8, 4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s1 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s8 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16 -; GFX9-DL-NEXT: s_lshr_b32 s10, s3, 12 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 8 -; GFX9-DL-NEXT: s_lshr_b32 s17, s4, 12 -; GFX9-DL-NEXT: s_lshr_b32 s18, s4, 8 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 12 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 8 +; GFX9-DL-NEXT: s_lshr_b32 s17, s8, 12 +; GFX9-DL-NEXT: s_lshr_b32 s18, s8, 8 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s18 @@ -2251,24 +2781,24 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 20 -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s12, s4, 20 -; GFX9-DL-NEXT: s_lshr_b32 
s13, s4, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 20 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s12, s8, 20 +; GFX9-DL-NEXT: s_lshr_b32 s13, s8, 16 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, v4, v11 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s12 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 28 -; GFX9-DL-NEXT: s_lshr_b32 s8, s3, 24 -; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 24 -; GFX9-DL-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 28 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-DL-NEXT: s_lshr_b32 s14, s8, 28 +; GFX9-DL-NEXT: s_lshr_b32 s15, s8, 24 +; GFX9-DL-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 ; GFX9-DL-NEXT: v_or_b32_e32 v4, v2, v4 @@ -2287,7 +2817,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14 ; GFX9-DL-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v7 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 
@@ -2299,9 +2829,202 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s22, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s23, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s20, s20, s3 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s21, s21, 0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: global_load_ubyte v1, v0, s[4:5] +; GFX10-DL-XNACK-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s0, 0xffff +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s9, s6, 4 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s16, s7, 4 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s10, s6, 12 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s17, s7, 12 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v13, 12, s17 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v5, 12, s10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v12, 12, v12 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s11, s6, 8 +; 
GFX10-DL-XNACK-NEXT: s_lshr_b32 s18, s7, 8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v4, 12, s11 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v11, 12, s18 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v19, 12, v5 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v6, v6, v12 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v13, 12, v13 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v11, 12, v11 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v2, v2, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v5, 8, v6 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v3, v19, v13 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s1, s6, 20 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s2, s6, 16 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s3, s6, 28 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s8, s6, 24 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s12, s7, 20 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v8, 12, s3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v9, 12, s2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v10, 12, s1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v12, 12, s12 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v4, v4, v11 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v3, 8, v3 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s13, s7, 16 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s14, s7, 28 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v6, 12, s13 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v5, 12, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v8 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v8, 12, v9 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v15, 12, s14 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v10 +; 
GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v9, 12, v12 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s15, s7, 24 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v10, 12, v15 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX10-DL-XNACK-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v4, v4, v9 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v15, v8, v6 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v7, v7, v10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v11, 12, v14 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v2, 8, v4 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v4, v5, v11 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v5, 8, v7 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v1, v8 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-XNACK-NEXT: v_or_b32_e32 v3, v2, v4 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul: +; 
GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s22, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s23, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s20, s20, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s21, s21, 0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s9, s0, 4 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s16, s1, 4 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s10, s0, 12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s17, s1, 12 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v13, 12, s17 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 12, s10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v12, 12, v12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s11, s0, 8 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s18, s1, 8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v4, 12, s11 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v11, 12, s18 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v19, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v6, v6, v12 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v13, 12, v13 +; GFX10-DL-NOXNACK-NEXT: 
v_ashrrev_i16_e64 v4, 12, v4 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v2, v2, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 8, v6 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v3, v19, v13 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s0, 20 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s7, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s12, s1, 20 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v8, 12, s7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v9, 12, s6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v10, 12, s3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v12, 12, s12 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v4, v4, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 8, v3 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s13, s1, 16 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s14, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v6, 12, s13 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v5, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v8, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v15, 12, s14 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v9, 12, v12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s15, s1, 24 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v10, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v4, v4, v9 +; 
GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v15, v8, v6 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v7, v7, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v11, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v2, 8, v4 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v4, v5, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 8, v7 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v1, v8 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_e32 v3, v2, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NOXNACK-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 1937cca..157cfec 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -115,18 +115,20 @@ define amdgpu_kernel void @udot8_acc32(<8 
x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: udot8_acc32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 @@ -136,17 +138,17 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v1, v2 
; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 @@ -160,53 +162,52 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s0, 
v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -388,37 +389,37 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: udot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, 
SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40004 -; GFX9-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-NEXT: s_and_b32 s3, s3, 15 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; 
GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 @@ -426,7 +427,7 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v2, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 @@ -436,42 +437,42 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s17, s3, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 -; 
GFX9-DL-NEXT: s_and_b32 s3, s3, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 @@ -479,7 +480,7 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 @@ -489,7 +490,7 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: 
v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: @@ -505,34 +506,34 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40008 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; 
GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm @@ -716,37 +717,37 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: udot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s3, 
0x40004 -; GFX9-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-NEXT: s_and_b32 s3, s3, 15 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 @@ -754,7 +755,7 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v2, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 @@ -764,42 +765,42 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: 
global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s17, s3, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-DL-NEXT: s_and_b32 s3, s3, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x4000c +; GFX9-DL-NEXT: 
s_bfe_u32 s16, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 @@ -807,7 +808,7 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 @@ -817,7 +818,7 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: @@ -833,34 +834,34 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 ; 
GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40008 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; 
GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm @@ -1048,41 +1049,41 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: udot8_acc4: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s10, s2, 15 -; GFX9-NEXT: s_and_b32 s17, s3, 15 -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s10, s0, 15 +; GFX9-NEXT: s_and_b32 s17, s1, 15 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; 
GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-NEXT: v_mov_b32_e32 v7, s13 @@ -1099,46 +1100,46 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 -; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 +; 
GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_and_b32 s10, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s17, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 @@ -1155,7 +1156,7 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-DL-NEXT: 
v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: @@ -1171,36 +1172,36 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s3, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s1, s3 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s2, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: 
v_add_nc_u32_e32 v1, v1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] @@ -1373,41 +1374,41 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; ; GFX9-LABEL: udot8_CommutationInsideMAD: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s10, s2, 15 -; GFX9-NEXT: s_and_b32 s17, s3, 15 -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-NEXT: global_load_ubyte v1, v0, 
s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s10, s0, 15 +; GFX9-NEXT: s_and_b32 s17, s1, 15 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-NEXT: v_mov_b32_e32 v7, s13 @@ -1424,46 +1425,46 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: 
udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 -; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_and_b32 s10, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s17, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 @@ -1480,7 +1481,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: @@ -1496,36 +1497,36 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: 
v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s0, s1 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s2, s3 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] @@ -1695,18 +1696,20 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* 
%src1, ; ; GFX9-LABEL: udot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 @@ -1716,19 +1719,19 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v1, v2 
; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v1, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s11, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s16 ; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 @@ -1741,26 +1744,27 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: v_mad_u32_u24 v2, s4, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-DL-NEXT: s_lshr_b32 s7, s6, 28 @@ -1770,19 +1774,19 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x4000c ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 ; 
GFX9-DL-NEXT: s_and_b32 s6, s6, 15 -; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 +; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v1, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v3, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s16 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v3, v2 @@ -1795,39 +1799,38 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_mov_b32 s12, 
SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s5, s1, 15 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v0 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c @@ -1845,7 +1848,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: 
global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2026,18 +2029,20 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: udot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 @@ -2047,17 +2052,17 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: 
s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 @@ -2071,53 +2076,52 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: 
v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s0, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2274,49 +2278,49 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: 
udot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: global_load_ushort v5, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: global_load_ushort v5, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 ; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v1, s3, v1 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s14 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, s1, v1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s13, s14 ; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_and_b32 s17, s6, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s15, s16 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s17, s6 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; 
GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_and_b32 s11, s2, 15 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s17, s6 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_and_b32 s11, s0, 15 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s8 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s2, v4 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s0, v4 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, s4, v2 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s10 ; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v3 @@ -2329,54 +2333,54 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ushort v5, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; 
GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v5, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 ; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 ; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, s3, v1 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s13, s14 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, s1, v1 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s13, s14 ; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s15, s16 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s17, s6 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s17, s6 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: 
s_bfe_u32 s8, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s11, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s5, s8 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s2, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s0, v4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s4, v2 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s10 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v3 @@ -2389,7 +2393,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: @@ -2405,42 +2409,42 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s7 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s2, s3 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 
0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_bfe_u32 s3, s6, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40004 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40008 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x4000c ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s6 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x40014 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x40014 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s3, s1 +; GFX10-DL-NEXT: s_lshr_b32 s1, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s3, s7, 28 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s2, s6 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s0, s2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40018 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3 @@ -2639,61 +2643,61 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s5, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s15, s4, 28 -; GFX9-NEXT: s_and_b32 s16, s4, 15 -; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 
s4, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s12, s8, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s8, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s8, 0x40018 +; GFX9-NEXT: s_lshr_b32 s15, s8, 28 +; GFX9-NEXT: s_and_b32 s16, s8, 15 +; GFX9-NEXT: s_bfe_u32 s17, s8, 0x40004 +; GFX9-NEXT: s_bfe_u32 s18, s8, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s6, s3, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s8, 0x4000c +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s1, 0x40018 ; GFX9-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-NEXT: s_lshr_b32 s8, s3, 28 +; GFX9-NEXT: s_lshr_b32 s7, s1, 28 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: s_and_b32 s9, s3, 15 +; GFX9-NEXT: s_and_b32 s9, s1, 15 ; GFX9-NEXT: v_mov_b32_e32 v6, s16 -; GFX9-NEXT: s_bfe_u32 s10, s3, 0x40004 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v7, s17 -; GFX9-NEXT: s_bfe_u32 s11, s3, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v8, s18 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v9, s4 -; GFX9-NEXT: v_mul_lo_u16_e32 v2, s5, v2 -; GFX9-NEXT: v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v4, s7, v4 -; GFX9-NEXT: v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-NEXT: v_mul_lo_u16_e32 v2, s4, v2 +; GFX9-NEXT: v_mul_lo_u16_sdwa v3, s5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v4, s6, v4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, s7, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v6, s9, v6 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; 
GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v4, v6, v7 ; GFX9-NEXT: v_mul_lo_u16_e32 v8, s11, v8 -; GFX9-NEXT: v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v9, s1, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX9-NEXT: v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v5 -; GFX9-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 @@ -2705,66 +2709,66 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 28 -; GFX9-DL-NEXT: s_and_b32 s16, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s12, s8, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s8, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s8, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s15, s8, 28 +; GFX9-DL-NEXT: s_and_b32 s16, s8, 15 +; GFX9-DL-NEXT: s_bfe_u32 s17, s8, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s18, s8, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s8, s8, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x40018 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-DL-NEXT: s_lshr_b32 s8, s3, 28 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: s_and_b32 s9, s3, 15 +; GFX9-DL-NEXT: s_and_b32 s9, s1, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s16 -; GFX9-DL-NEXT: s_bfe_u32 s10, s3, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s17 -; GFX9-DL-NEXT: s_bfe_u32 s11, s3, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s18 -; GFX9-DL-NEXT: s_bfe_u32 
s3, s3, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s4 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, s5, v2 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, s7, v4 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, s4, v2 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v3, s5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, s6, v4 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, s7, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, s9, v6 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v4, v6, v7 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, s11, v8 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, s1, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v5 -; GFX9-DL-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v1, v4, v1 @@ -2776,7 +2780,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 ; 
GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: @@ -2792,45 +2796,45 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, s3, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s3 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s8, s7 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, s1, s3 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 +; GFX10-DL-NEXT: s_bfe_u32 s8, s6, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s0, s1 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s8, s3 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s2, s1, 0x40008 -; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s7, 0x40008 +; GFX10-DL-NEXT: s_mov_b32 s1, 0xffff +; GFX10-DL-NEXT: v_mul_lo_u16_e64 
v5, s2, s0 ; GFX10-DL-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s8, s7, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: v_and_b32_e32 v2, s1, v2 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s6, s8 -; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x40010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s8 +; GFX10-DL-NEXT: s_bfe_u32 s3, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s9, s7, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s6, s6, 28 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3 -; GFX10-DL-NEXT: s_lshr_b32 s6, s1, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s9 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s6 +; GFX10-DL-NEXT: s_lshr_b32 s2, s7, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s0, s9 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s6, s2 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40018 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s7, s0 +; GFX10-DL-NEXT: s_bfe_u32 s0, s7, 0x40018 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s3, s0 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v6 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s3, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s1, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -2993,41 +2997,41 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX9-LABEL: udot8_acc4_vecMul: ; GFX9: ; %bb.0: ; 
%entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s10, s2, 15 -; GFX9-NEXT: s_and_b32 s17, s3, 15 -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s10, s0, 15 +; GFX9-NEXT: s_and_b32 s17, s1, 15 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-NEXT: v_mov_b32_e32 v7, s13 @@ -3044,46 +3048,46 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 -; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: 
s_and_b32 s10, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s17, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 @@ -3100,7 +3104,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: 
udot8_acc4_vecMul: @@ -3116,36 +3120,36 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s3, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s1, s3 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s2, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 
0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] @@ -3278,34 +3282,34 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr, ; GFX9-LABEL: udot8_variant1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 15 -; GFX9-NEXT: s_and_b32 s5, s3, 15 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_and_b32 s4, s0, 15 +; GFX9-NEXT: s_and_b32 s5, s1, 15 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 
0x4000c +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40004 -; GFX9-NEXT: s_bfe_u32 s9, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40018 -; GFX9-NEXT: s_lshr_b32 s3, s3, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40018 +; GFX9-NEXT: s_lshr_b32 s1, s1, 28 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mad_u32_u24 v1, s1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mad_u32_u24 v1, s7, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -3318,39 +3322,39 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr, ; GFX9-NEXT: v_mad_u32_u24 v1, s15, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mad_u32_u24 v1, s17, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_variant1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, 
s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, i32 addrspace(1)* %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index b94d908..55157f1 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -1,7 +1,7 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs 
-show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s ; FIXME: Merge into imm.ll ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 8e77a1a..367a126 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -53,7 +53,7 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr() { ; GCN-NEXT: is_ptr64 = 1 ; GCN-NEXT: is_dynamic_callstack = 1 ; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 1 +; GCN-NEXT: is_xnack_enabled = 0 ; GCN-NEXT: workitem_private_segment_byte_size = 16384 ; GCN-NEXT: 
workgroup_group_segment_byte_size = 0 ; GCN-NEXT: gds_segment_byte_size = 0 @@ -149,7 +149,7 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() { ; GCN-NEXT: is_ptr64 = 1 ; GCN-NEXT: is_dynamic_callstack = 1 ; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 1 +; GCN-NEXT: is_xnack_enabled = 0 ; GCN-NEXT: workitem_private_segment_byte_size = 16384 ; GCN-NEXT: workgroup_group_segment_byte_size = 0 ; GCN-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 1d15482..fc89c67 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -40,12 +40,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* % ; GFX9-LABEL: s_insertelement_v2i16_0_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_pack_lh_b32_b16 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -91,13 +91,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword 
v0, v1, s[0:1] ; GFX9-NEXT: ;;#ASMSTART @@ -157,12 +157,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* ; GFX9-LABEL: s_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_pack_hh_b32_b16 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -210,11 +210,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -275,11 +275,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2 @@ -387,12 +387,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* % ; 
GFX9-LABEL: s_insertelement_v2i16_1_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -562,12 +562,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* ; GFX9-LABEL: v_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4 +; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1039,17 +1039,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, s4, 4 -; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_andn2_b32 s2, s2, s3 -; GFX9-NEXT: s_and_b32 s3, s3, 0x3e703e7 -; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 
s2, s4, 4 +; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s3, s5, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7 +; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -1102,12 +1102,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_lshl_b32 s2, s4, 4 +; GFX9-NEXT: s_lshl_b32 s2, s6, 4 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 @@ -1169,10 +1169,10 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1252,13 +1252,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out ; GFX9-LABEL: v_insertelement_v4f16_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, v3, s4, v0 +; GFX9-NEXT: v_bfi_b32 v0, v3, s6, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1317,13 +1317,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out ; GFX9-LABEL: v_insertelement_v4f16_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s4, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1382,13 +1382,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out ; GFX9-LABEL: v_insertelement_v4f16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1 +; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1447,13 +1447,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out ; GFX9-LABEL: v_insertelement_v4f16_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: 
v_lshl_or_b32 v1, s4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1512,13 +1512,13 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, ; GFX9-LABEL: v_insertelement_v4i16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1 +; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1580,7 +1580,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac ; GFX9-NEXT: global_load_dword v2, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] @@ -1588,7 +1588,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1 ; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0 @@ -1667,17 +1667,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6 ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 4 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_lshl_b32 s4, s7, 4 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1 ; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 8fc352b..67891d5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s4, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: @@ -31,11 +31,11 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) @@ -46,13 +46,13 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 { ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s2, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s4, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: @@ -69,10 +69,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1) ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, s0 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) @@ -92,22 +92,22 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out define amdgpu_kernel 
void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32: @@ -136,12 +136,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 ; 
GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -269,22 +269,22 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1) define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: @@ -313,12 +313,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1) ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -339,22 +339,22 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1) define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: @@ -383,12 +383,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1) ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -409,22 +409,22 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1) define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: @@ -453,12 +453,12 @@ define amdgpu_kernel 
void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -480,22 +480,22 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, 
-v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: @@ -524,12 +524,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll index 89f6b8a..68c0362 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll @@ -32,8 +32,9 @@ main_body: } ; GCN-LABEL: {{^}}sample_contig_nsa: -; GCN: image_sample_c_l v0, v[0:7], -; NSA: image_sample v1, [v6, v7, v5], +; NONSA: image_sample_c_l v5, v[0:7], +; NSA: image_sample_c_l v8, v[0:7], +; NSA: image_sample v9, [v6, v7, v5], define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -44,8 +45,8 @@ main_body: } ; GCN-LABEL: {{^}}sample_nsa_nsa: -; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], -; NSA: image_sample v1, [v6, v7, v5], +; NSA: image_sample_c_l v8, 
[v1, v2, v3, v4, v0], +; NSA: image_sample v9, [v6, v7, v5], define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -56,8 +57,8 @@ main_body: } ; GCN-LABEL: {{^}}sample_nsa_contig: -; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], -; NSA: image_sample v1, v[5:7], +; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0], +; NSA: image_sample v9, v[5:7], define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -68,9 +69,10 @@ main_body: } ; GCN-LABEL: {{^}}sample_contig_contig: -; GCN: image_sample_c_l v0, v[0:7], -; NSA: image_sample v1, v[5:7], -; NONSA: image_sample v1, v[5:7], +; NSA: image_sample_c_l v8, v[0:7], +; NSA: image_sample v9, v[5:7], +; NONSA: image_sample_c_l v8, v[0:7], +; NONSA: image_sample v9, v[5:7], define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 0579e46..4a3c1a5a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { ; VARIANT0-LABEL: test_barrier: ; VARIANT0: ; %bb.0: ; %entry ; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb +; VARIANT0-NEXT: s_load_dword s0, s[0:1], 0xb ; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { ; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VARIANT0-NEXT: s_barrier -; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3 +; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 @@ -29,7 +29,7 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { ; VARIANT1-LABEL: test_barrier: ; VARIANT1: ; %bb.0: ; %entry ; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb +; VARIANT1-NEXT: s_load_dword s0, s[0:1], 0xb ; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -38,7 +38,7 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_barrier -; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3 +; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT1-NEXT: s_waitcnt expcnt(0) @@ -50,11 +50,11 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { ; VARIANT2-LABEL: test_barrier: ; VARIANT2: ; %bb.0: ; %entry ; 
VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c +; VARIANT2-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] -; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s0 +; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VARIANT2-NEXT: v_mov_b32_e32 v3, s3 @@ -70,11 +70,11 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry ; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c +; VARIANT3-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] -; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s0 +; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VARIANT3-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 9b92dd7..d1c383f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -5,14 +5,14 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_bfe_u32 v0, v0, s3, s3 -; 
SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_bfe_u32 v0, v0, s5, s5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_arg_arg: @@ -34,15 +34,15 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0 define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_arg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfe_u32 v0, s2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfe_u32 v0, s4, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_arg_imm: @@ -65,15 +65,15 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0 define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_imm_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfe_u32 v0, s2, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfe_u32 v0, s4, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_imm_arg: @@ -96,16 +96,16 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0 define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_imm_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s0, 0x7b +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_movk_i32 s6, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfe_u32 v0, s0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfe_u32 v0, s6, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_imm_arg_arg: @@ -1590,13 +1590,13 @@ define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 { ; SI-LABEL: lshr_and: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x30006 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_and: @@ -1619,15 +1619,15 @@ define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 { define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; SI-LABEL: v_lshr_and: ; 
SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s0, s2, s3 -; SI-NEXT: s_and_b32 s0, s0, 7 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_lshr_b32 s2, s4, s5 +; SI-NEXT: s_and_b32 s4, s2, 7 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_and: @@ -1652,13 +1652,13 @@ define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 { ; SI-LABEL: and_lshr: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x30006 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: and_lshr: @@ -1682,13 +1682,13 @@ define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 { ; SI-LABEL: and_lshr2: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x30006 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_mov_b32 s2, -1 +; 
SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: and_lshr2: @@ -1712,13 +1712,13 @@ define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 { ; SI-LABEL: shl_lshr: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x150002 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x150002 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_lshr: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 8bd7a11..c0a1314 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -292,17 +292,17 @@ define amdgpu_kernel void @maxnum_v2f16( ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -529,12 +529,12 @@ define 
amdgpu_kernel void @maxnum_v3f16( ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -643,12 +643,12 @@ define amdgpu_kernel void @maxnum_v4f16( ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 69af4c1..cd6b546 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -315,17 +315,17 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 
; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -582,12 +582,12 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -696,12 +696,12 @@ define amdgpu_kernel void @minnum_v4f16( ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index a31b5812..b0d686b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -3636,98 +3636,98 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-NOHSA-SI: ; %bb.0: ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x10 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s37, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s36, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s22, s37 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s23, s36 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s39, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s38, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s26, s39 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s27, s38 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s41, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s40, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s30, s41 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s31, s40 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s43, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s42, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s35, s43 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s36, s42 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s45, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s44, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s39, s45 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s40, s44 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s47, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s42, s46, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s43, s47 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s44, s46 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s49, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s48, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s47, s49 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s48, s48 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s49, s51, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s52, s50, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s51, s51 -; 
GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s50, s50 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s53, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s54, s4, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s6, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s22, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s6, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s8, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s8, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s10, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s60, s11 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s10, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s12, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s12, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s15, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s14, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s15, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s14, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s17, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s16, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s17, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s16, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s17, s17 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s16, s16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s19, 16 -; 
GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s18, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s19, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s52, s18, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s19, s19 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s18, s18 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s53, s37, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s54, s36, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s37, s37 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s36, s36 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s39, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s38, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s39, s39 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s38, s38 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s41, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s40, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s41, s41 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s40, s40 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s42, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s60, s43 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s42, s42 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s45, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s44, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s45, s45 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s44, s44 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s47, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s46, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s47, s47 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s46, s46 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s49, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s48, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s49, s49 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s48, s48 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s51, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s50, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s51, s51 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s50, s50 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s43, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s49 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s46 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s47 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s45 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s42 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s59 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s43 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s41 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s57 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 @@ -3736,63 +3736,63 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s53 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 204ba769..2f131ba 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -510,14 +510,14 @@ define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_load_v16i16: @@ -8003,33 +8003,33 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: 
buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v3 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[6:7], 48 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[2:3], 48 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[4:5], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[0:1], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v7 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[2:3], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[6:7], 48 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[4:5], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v5, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) @@ -8055,48 +8055,48 @@ 
define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v7, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 0, 16 +; 
GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 @@ -8105,7 +8105,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace ; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 4385500..3052e73 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -44,14 +44,14 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out) { ; MUBUF-NEXT: v_add_u32_e32 v1, 0x20d0, v1 ; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 glc +; MUBUF-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen glc +; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 glc +; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 -; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 +; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) ; MUBUF-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -135,16 +135,16 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out) { ; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3 ; MUBUF-NEXT: buffer_load_dword v4, v3, 
s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 glc +; MUBUF-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen glc +; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 glc +; MUBUF-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x180000 ; MUBUF-NEXT: s_mov_b32 s33, s5 -; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 -; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 +; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc ; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -234,25 +234,26 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1 ; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c4, v0 ; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc +; MUBUF-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v2, 0x12cc, v0 +; MUBUF-NEXT: v_or_b32_e32 v1, 0x12cc, v0 ; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c8, v0 ; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000 -; MUBUF-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen glc +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v12, 0 ; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v7, v13, s[0:3], 0 offen glc +; MUBUF-NEXT: buffer_load_dword v8, v13, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000 
-; MUBUF-NEXT: buffer_load_dword v8, v13, s[0:3], 0 offen offset:4 glc +; MUBUF-NEXT: buffer_load_dword v9, v13, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000 ; MUBUF-NEXT: buffer_load_dword v2, v13, s[0:3], 0 offen offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000 -; MUBUF-NEXT: buffer_load_dword v9, v13, s[0:3], 0 offen offset:12 glc +; MUBUF-NEXT: buffer_load_dword v3, v13, s[0:3], 0 offen offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000 ; MUBUF-NEXT: buffer_load_dword v10, v13, s[0:3], 0 offen offset:16 glc @@ -260,11 +261,10 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1 ; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000 ; MUBUF-NEXT: buffer_load_dword v11, v13, s[0:3], 0 offen offset:20 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v12, 0 ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 -; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc -; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v1, v7 -; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v8, vcc +; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v7, v8 +; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v9, vcc ; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10 ; MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v11, vcc ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 1f61333..aedb081 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -10,11 +10,11 @@ define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s5, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -140,12 +140,12 @@ define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16 ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -209,12 +209,12 @@ define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16 ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index dfbdd2b..28e4cdf 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -29,11 +29,11 @@ define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrs ; GFX9-LABEL: v_test_imax_sge_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort 
v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -79,11 +79,11 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, < ; GFX9-LABEL: v_test_imax_sge_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -140,14 +140,17 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, < ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_short_d16 v1, v0, s[0:1] offset:4 -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] +; GFX9-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_max_i16 v1, v2, v1 @@ -200,11 +203,11 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, < ; GFX9-LABEL: v_test_imax_sge_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 @@ -249,11 +252,11 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrs ; GFX9-LABEL: v_test_imax_sgt_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -297,11 +300,11 @@ define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrs ; GFX9-LABEL: v_test_umax_uge_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -345,11 +348,11 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrs ; GFX9-LABEL: v_test_umax_ugt_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -394,11 +397,11 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, < ; GFX9-LABEL: v_test_umax_ugt_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 9beeb85..26e2a1b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -1219,25 +1219,25 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_cmpxchg: @@ -1292,13 +1292,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1308,13 +1308,13 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; 
GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1372,29 +1372,29 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_cmpxchg: @@ -1452,15 +1452,15 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1470,15 +1470,15 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 
v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1541,15 +1541,15 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1559,15 +1559,15 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1628,13 +1628,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1644,13 +1644,13 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX10-CU-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 
s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1712,15 +1712,15 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1730,15 +1730,15 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX10-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1801,15 +1801,15 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1819,15 +1819,15 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1890,15 +1890,15 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1908,15 +1908,15 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1979,15 +1979,15 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1997,15 +1997,15 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2070,33 +2070,33 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: @@ -2161,37 +2161,37 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: @@ -2257,37 +2257,37 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: @@ -2351,33 +2351,33 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: 
buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_ret_cmpxchg: @@ -2442,37 +2442,37 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_ret_cmpxchg: @@ -2538,37 +2538,37 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: @@ -2634,37 +2634,37 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 
-; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; 
; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: @@ -2730,37 +2730,37 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: @@ -3994,25 +3994,25 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_agent_one_as_monotonic_monotonic_cmpxchg: @@ -4067,13 +4067,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4082,13 +4082,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4145,29 +4145,29 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; 
GFX10-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_monotonic_cmpxchg: @@ -4225,15 +4225,15 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4242,15 +4242,15 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4312,15 +4312,15 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4329,15 +4329,15 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4397,13 +4397,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4412,13 +4412,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4479,15 +4479,15 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4496,15 +4496,15 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4566,15 +4566,15 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4583,15 +4583,15 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4653,15 +4653,15 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4670,15 +4670,15 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4740,15 +4740,15 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4757,15 +4757,15 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4829,33 +4829,33 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: @@ -4920,37 +4920,37 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -5016,37 +5016,37 @@ define amdgpu_kernel void 
@global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; 
GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -5110,33 +5110,33 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: @@ -5201,37 +5201,37 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: @@ -5297,37 +5297,37 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: @@ -5393,37 +5393,37 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; 
GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: @@ -5489,37 +5489,37 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 5cb013b..426d8aa 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -1096,25 +1096,25 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 
-; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: @@ -1165,25 +1165,25 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -1234,25 +1234,25 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_monotonic_cmpxchg: @@ -1303,25 +1303,25 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -1372,25 +1372,25 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] 
offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -1441,25 +1441,25 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -1510,25 +1510,25 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -1579,25 +1579,25 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 
-; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -1648,25 +1648,25 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; 
GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -1717,25 +1717,25 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -1792,29 +1792,29 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: @@ -1875,29 +1875,29 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 
v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: @@ -1958,29 +1958,29 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: @@ -2041,29 +2041,29 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: @@ -2124,29 +2124,29 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap 
v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_ret_cmpxchg: @@ -2207,29 +2207,29 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] 
offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: @@ -2290,29 +2290,29 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: @@ -2373,29 +2373,29 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: @@ -3511,25 +3511,25 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: @@ -3580,25 +3580,25 @@ define amdgpu_kernel void 
@global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -3649,25 +3649,25 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: @@ -3718,25 +3718,25 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -3787,25 +3787,25 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; 
GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -3856,25 +3856,25 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -3925,25 +3925,25 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_singlethread_one_as_release_acquire_cmpxchg: @@ -3994,25 +3994,25 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -4063,25 +4063,25 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -4132,25 +4132,25 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -4207,29 +4207,29 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: 
global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: @@ -4290,29 +4290,29 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -4373,29 +4373,29 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -4456,29 +4456,29 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: @@ -4539,29 +4539,29 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: @@ -4622,29 +4622,29 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc 
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: @@ -4705,29 +4705,29 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 
glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: @@ -4788,29 +4788,29 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] 
offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index 0874452..f1357c8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -1219,25 +1219,25 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_cmpxchg: @@ -1292,13 +1292,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX10-WGP-NEXT: buffer_gl0_inv @@ -1308,13 +1308,13 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1372,29 +1372,29 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; 
GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_release_monotonic_cmpxchg: @@ -1452,15 +1452,15 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1470,15 +1470,15 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; 
GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1541,15 +1541,15 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1559,15 +1559,15 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: 
s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1628,13 +1628,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1644,13 +1644,13 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1712,15 +1712,15 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1730,15 +1730,15 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1801,15 +1801,15 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1819,15 +1819,15 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 
0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1890,15 +1890,15 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1908,15 +1908,15 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1979,15 +1979,15 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1997,15 +1997,15 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2070,33 +2070,33 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_ret_cmpxchg: @@ -2161,37 +2161,37 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; 
GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: @@ -2257,37 +2257,37 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: @@ -2351,33 +2351,33 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_ret_cmpxchg: @@ -2442,37 +2442,37 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_release_acquire_ret_cmpxchg: @@ -2538,37 +2538,37 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: @@ -2634,37 +2634,37 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: @@ -2730,37 +2730,37 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: @@ -3994,25 +3994,25 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: @@ -4067,13 +4067,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4082,13 +4082,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 
v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4145,29 +4145,29 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg: 
@@ -4225,15 +4225,15 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4242,15 +4242,15 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: 
buffer_gl1_inv @@ -4312,15 +4312,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4329,15 +4329,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; 
GFX10-CU-NEXT: buffer_gl1_inv @@ -4397,13 +4397,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4412,13 +4412,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4479,15 +4479,15 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX10-WGP-LABEL: 
global_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4496,15 +4496,15 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4566,15 +4566,15 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-LABEL: 
global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4583,15 +4583,15 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4653,15 +4653,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-LABEL: 
global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4670,15 +4670,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4740,15 +4740,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-LABEL: 
global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4757,15 +4757,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4829,33 +4829,33 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; 
GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_system_one_as_acquire_monotonic_ret_cmpxchg: @@ -4920,37 +4920,37 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -5016,37 +5016,37 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -5110,33 +5110,33 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: @@ -5201,37 +5201,37 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; 
GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: @@ -5297,37 +5297,37 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: 
buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: @@ -5393,37 +5393,37 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: @@ -5489,37 +5489,37 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 
cb32f3a..c86d05b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -1096,25 +1096,25 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: @@ -1165,25 +1165,25 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; 
GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -1234,25 +1234,25 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 
s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_cmpxchg: @@ -1303,25 +1303,25 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; 
GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -1372,25 +1372,25 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -1441,25 +1441,25 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_wavefront_acquire_acquire_cmpxchg: @@ -1510,25 +1510,25 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -1579,25 +1579,25 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 
+; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -1648,25 +1648,25 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -1717,25 +1717,25 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -1792,29 +1792,29 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: @@ -1875,29 +1875,29 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: @@ -1958,29 +1958,29 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: @@ -2041,29 +2041,29 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: @@ -2124,29 +2124,29 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_ret_cmpxchg: @@ -2207,29 +2207,29 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: @@ -2290,29 +2290,29 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: @@ -2373,29 +2373,29 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: @@ -3511,25 +3511,25 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: @@ -3580,25 +3580,25 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -3649,25 +3649,25 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: @@ -3718,25 +3718,25 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -3787,25 +3787,25 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -3856,25 +3856,25 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -3925,25 +3925,25 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap 
v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -3994,25 +3994,25 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -4063,25 +4063,25 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; 
GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -4132,25 +4132,25 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -4207,29 +4207,29 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: @@ -4290,29 +4290,29 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -4373,29 +4373,29 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -4456,29 +4456,29 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: @@ -4539,29 +4539,29 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: @@ -4622,29 +4622,29 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: @@ -4705,29 +4705,29 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: @@ -4788,29 +4788,29 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index a1ec279..46e4bad 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -1168,25 +1168,25 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 
v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: @@ -1239,13 +1239,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX10-WGP-NEXT: buffer_gl0_inv @@ -1254,13 +1254,13 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1315,28 +1315,28 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_cmpxchg: @@ -1392,15 +1392,15 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1409,14 +1409,14 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1474,15 +1474,15 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1491,14 +1491,14 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1554,13 +1554,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1569,13 +1569,13 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1632,15 +1632,15 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1649,14 +1649,14 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1714,15 +1714,15 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1731,14 +1731,14 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, 
v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1796,15 +1796,15 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1813,14 +1813,14 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] 
offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1878,15 +1878,15 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1895,14 +1895,14 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ 
-1963,30 +1963,30 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_workgroup_acquire_monotonic_ret_cmpxchg: @@ -2050,33 +2050,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: @@ -2141,33 +2141,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: @@ -2230,30 +2230,30 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 
glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: @@ -2317,33 +2317,33 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_ret_cmpxchg: @@ -2408,33 +2408,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: @@ -2499,33 +2499,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: @@ -2590,33 +2590,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: 
s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: @@ -3759,25 +3759,25 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: 
s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: @@ -3828,13 +3828,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -3842,13 +3842,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -3899,27 +3899,27 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: @@ -3970,15 +3970,15 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -3986,13 +3986,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -4043,15 +4043,15 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4059,13 +4059,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -4116,13 +4116,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4130,13 +4130,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_workgroup_one_as_acquire_acquire_cmpxchg: @@ -4187,15 +4187,15 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4203,13 +4203,13 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -4260,15 +4260,15 @@ define 
amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4276,13 +4276,13 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -4333,15 +4333,15 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; 
GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4349,13 +4349,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -4406,15 +4406,15 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; 
%bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4422,13 +4422,13 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -4485,30 +4485,30 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: @@ -4569,32 +4569,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: 
s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -4655,32 +4655,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; 
GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -4741,30 +4741,30 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: 
s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: @@ -4825,32 +4825,32 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: @@ -4911,32 +4911,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: @@ -4997,32 +4997,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: @@ -5083,32 +5083,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index c6919a0..385ea6f 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -495,8 +495,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, < ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: ; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} ; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; GCN: s_min_u32 [[MIN:s[0-9]+]], s{{[0-9]}}, s{{[0-9]}} +; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], s{{[0-9]}} ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT diff --git a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll index bf23d01..962e33b 100644 --- a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll +++ b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}sample_contig_nsa: ; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], diff --git a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir index 
89ce7fd..93873a1 100644 --- a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: hazard_image_sample_d_buf_off6 # GCN: IMAGE_SAMPLE diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index 1657f69..fba59ff 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -622,7 +622,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(i8* %p) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -635,7 +635,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -652,7 +652,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(i8* %p) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -665,7 +665,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(i8* 
%p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -682,7 +682,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(i8* %p) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -695,7 +695,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -714,7 +714,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -727,7 +727,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -746,7 +746,7 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_neg_11bit_max(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -759,7 +759,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -778,7 +778,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -791,7 +791,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -810,7 +810,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], 
v0 ; GFX9-NEXT: s_endpgm @@ -823,7 +823,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -840,7 +840,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(i8* %p) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -853,7 +853,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -872,7 +872,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -885,7 +885,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -904,7 +904,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -917,7 +917,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -936,7 +936,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -949,7 +949,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -968,7 +968,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: 
flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -981,7 +981,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1000,7 +1000,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1013,7 +1013,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1045,7 +1045,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; 
GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1077,7 +1077,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1109,7 +1109,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1129,7 +1129,7 @@ define 
amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1142,7 +1142,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1162,7 +1162,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1175,7 +1175,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1195,7 +1195,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(i8* %p) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(i8* %p) { ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1229,7 +1229,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8* ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8* ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1263,7 +1263,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8* ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1276,7 +1276,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8* ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; 
GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8* ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1310,7 +1310,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8* ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1331,7 +1331,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(i8* ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1344,7 +1344,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(i8* ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1365,7 +1365,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8* ; 
GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1378,7 +1378,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8* ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1399,7 +1399,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(i8* ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1412,7 +1412,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(i8* ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index c515dc1..00d9f58 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -609,7 +609,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) { ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -619,7 +619,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) { ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -635,7 +635,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -645,7 +645,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -661,7 +661,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -671,7 +671,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -687,7 +687,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -697,7 +697,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -713,7 +713,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1) ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -723,7 +723,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1) ; GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -739,7 +739,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1) ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -750,7 +750,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1) ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xfffff000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -769,7 +769,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -780,7 +780,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1) ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc 
dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -796,7 +796,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -806,7 +806,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -822,7 +822,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)* ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -832,7 +832,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)* ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -848,7 +848,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 
addrspace(1)* ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -858,7 +858,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)* ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -874,7 +874,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -885,7 +885,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xfffff000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -904,7 +904,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -915,7 +915,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -934,7 +934,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -945,7 +945,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffc000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -964,7 +964,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspa ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -975,7 +975,7 @@ define amdgpu_kernel 
void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspa ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -994,7 +994,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspa ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1005,7 +1005,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspa ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1024,7 +1024,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspa ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspa ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 ; GFX10-NEXT: 
v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1055,7 +1055,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspa ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1066,7 +1066,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspa ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1086,7 +1086,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspa ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1097,7 +1097,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspa ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1117,7 +1117,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspa ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1128,7 +1128,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspa ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x2000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1182,7 +1182,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_11bit_neg_high_split1(i ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1194,7 +1194,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1215,7 +1215,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1227,7 +1227,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1248,7 +1248,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1260,7 +1260,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1281,7 +1281,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1293,7 +1293,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1314,7 +1314,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1326,7 +1326,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll index 2af8db9..3093d97 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll @@ -47,10 +47,10 @@ bb: ; GCN-LABEL: reassoc_v2i32: ; GCN: s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_add_i32 [[ADD2:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} -; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}} +; GCN-DAG: s_add_i32 [[ADD2:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} +; GFX8-DAG: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}} ; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD2]], v{{[0-9]+}} -; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}} +; GFX9-DAG: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}} ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD2]], v{{[0-9]+}} define amdgpu_kernel void @reassoc_v2i32(<2 x i32> addrspace(1)* %arg, <2 x i32> %x, <2 x i32> %y) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index 0a223e8..99f0f9e 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -58,19 +58,19 @@ define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b ; GFX9-LABEL: saddo_i64_zext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_add_u32 s2, s6, s0 +; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s3, s7, s1 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -132,14 +132,14 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* ; GFX9-LABEL: s_saddo_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_add_i32 s1, s0, s1 -; GFX9-NEXT: v_add_i32 v1, s0, v1 clamp -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_add_i32 s0, s2, s3 +; GFX9-NEXT: v_add_i32 v1, s2, v1 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 
ed216cb..953cc60 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -223,13 +223,13 @@ bb: define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind { ; SI-LABEL: scalar_to_vector_test6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_to_vector_test6: diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll index 746a277..95c0894 100644 --- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll @@ -8,15 +8,15 @@ define amdgpu_kernel void @select_constant_cttz(i32 addrspace(1)* noalias %out, ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s8, s[2:3], 0x0 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, 1, s8 +; GCN-NEXT: s_lshr_b32 s0, 1, s2 ; GCN-NEXT: s_ff1_i32_b32 s0, s0 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], s8, 0 +; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[2:3] ; GCN-NEXT: v_ffbh_i32_e32 v1, v0 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll 
b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll index ae50d4f..f5f1d63 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { @@ -14,6 +15,7 @@ define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addr } ; GCN-LABEL: const_load_no_shrink_dword_to_aligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { @@ -27,6 +29,7 @@ define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrsp } ; GCN-LABEL: global_load_no_shrink_dword_to_unaligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) { @@ -40,6 +43,7 @@ define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 add } ; GCN-LABEL: global_load_no_shrink_dword_to_aligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) { diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 2c2a64c..8162f3518 100644 --- 
a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,12 +8,12 @@ define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -140,12 +140,12 @@ define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -209,12 +209,12 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll index 4b62a98..fba92c6 100644 --- 
a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -149,8 +149,8 @@ main_body: ; GCN-LABEL: {{^}}smrd_load_const1: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff glc ; encoding: [0xff -; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc ; -; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc glc ; +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc ; +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc glc ; define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -172,8 +172,8 @@ main_body: ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400 +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400 define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float 
%arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -193,8 +193,8 @@ main_body: ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -210,8 +210,8 @@ main_body: ; SMRD load with an offset greater than the largest possible immediate on VI ; GCN-LABEL: {{^}}smrd_load_const4: ; SIVIGFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 -; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] -; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]:[0-9]+}}], [[OFFSET]] +; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]:[0-9]+}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 7b0a69f..d1fa465 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=verde -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX6 %s ; RUN: llc -regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX9-FLATSCR,FLATSCR %s +; RUN: llc -march=amdgcn -mattr=-xnack -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX9-FLATSCR,FLATSCR %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX10-FLATSCR,FLATSCR %s ; ; There is something about Tonga that causes this test to spend a lot of time diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll index a9885a0..41377c9 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -137,7 +137,7 @@ define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 { ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse ; GCN: NumVgprs: 10 -; GFX900: ScratchSize: 44 +; GFX900: ScratchSize: 52 ; GFX908: ScratchSize: 20 ; GCN: VGPRBlocks: 2 ; GCN: NumVGPRsForWavesPerEU: 10 @@ -246,7 +246,7 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* % ; GFX908-DAG: v_accvgpr_read_b32 ; GCN: NumVgprs: 256 -; GFX900: 
ScratchSize: 2052 +; GFX900: ScratchSize: 1028 ; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir index f686d9f..417f28c 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir @@ -308,12 +308,12 @@ body: | ; VR: renamable $sgpr8 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr12, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr9 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr13, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr14 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr15, 0, 0 :: (dereferenceable invariant load 4) - ; VR: renamable $sgpr15 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr16, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr10_sgpr11 = IMPLICIT_DEF + ; VR: renamable $sgpr17 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr22, 0, 0 :: (dereferenceable invariant load 4) + ; VR: renamable $sgpr15 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr16, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr12 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr18, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr13 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr19, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr16 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr21, 0, 0 :: (dereferenceable invariant load 4) - ; VR: renamable $sgpr17 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr22, 0, 0 :: 
(dereferenceable invariant load 4) ; VR: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr10_sgpr11, implicit killed renamable $sgpr8, implicit killed renamable $sgpr9, implicit killed renamable $sgpr12, implicit killed renamable $sgpr13, implicit killed renamable $sgpr14, implicit killed renamable $sgpr15, implicit killed renamable $sgpr16, implicit killed renamable $sgpr17 %0:sgpr_128 = IMPLICIT_DEF %1:sreg_64 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index e5646f7..96fbc48 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -10,56 +10,61 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK: bb.0..expVert: ; CHECK: liveins: $sgpr3, $sgpr4, $sgpr5, $sgpr8, $sgpr9, $sgpr10, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr25, $sgpr27, $sgpr31 ; CHECK: undef %56.sub0:sgpr_64 = COPY $sgpr31 - ; CHECK: SI_SPILL_S32_SAVE $sgpr27, %stack.2, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) - ; CHECK: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr25 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; CHECK: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr27 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr25 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr18 ; CHECK: undef %50.sub0:sgpr_64 = COPY $sgpr19 - ; CHECK: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr20 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr21 - ; CHECK: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr22 - ; CHECK: [[COPY8:%[0-9]+]]:sgpr_32 = 
COPY $sgpr23 - ; CHECK: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr9 - ; CHECK: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr10 - ; CHECK: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr8 + ; CHECK: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr20 + ; CHECK: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr21 + ; CHECK: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr22 + ; CHECK: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr23 + ; CHECK: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr9 + ; CHECK: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10 + ; CHECK: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK: undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0, 0 :: (load 8 from %ir.40, addrspace 4) ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; CHECK: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc - ; CHECK: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc - ; CHECK: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc ; CHECK: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc ; CHECK: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc - ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc - ; CHECK: SI_SPILL_S32_SAVE [[S_AND_B32_]], %stack.0, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.0, addrspace 5) + ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc - ; CHECK: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY4]], [[S_LSHL_B32_2]], 
implicit-def $scc + ; CHECK: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0, 0 :: (load 16 from %ir.84, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0, 0 :: (load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: early-clobber %73:sgpr_128, early-clobber %143:sgpr_128, early-clobber %131:sreg_32_xm0_xexec = BUNDLE %130, undef %132:sgpr_128, undef %74:sreg_64 { + ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0, 0 :: (load 16 from %ir.84, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0, 0 :: (load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: } ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: %71.sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK: %71.sub2:sgpr_128 = S_MOV_B32 -1 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit 
$exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: SI_SPILL_S128_SAVE %71, %stack.1, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5) + ; CHECK: early-clobber %87:vgpr_32, early-clobber %117:vgpr_32, early-clobber %76:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM1]], undef %118:sgpr_128, undef %89:sgpr_128, [[V_MOV_B32_e32_]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: } + ; CHECK: [[COPY13:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK: %71.sub1:sgpr_128 = S_MOV_B32 0 ; CHECK: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc ; CHECK: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc - ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY4]], 64, implicit-def $scc + ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY5]], 64, implicit-def $scc ; CHECK: 
[[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %54:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %149.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %149.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK: undef %156.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %156.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc ; CHECK: undef %163.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0, 0 :: (load 16 from %ir.91, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0, 0 :: (load 16 from %ir.97, addrspace 4) + ; CHECK: early-clobber %150:sgpr_128, early-clobber %157:sgpr_128 = BUNDLE %149, %156 { + ; CHECK: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0, 0 :: (load 16 from %ir.91, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0, 0 :: (load 16 from %ir.97, addrspace 4) + ; CHECK: } ; CHECK: %163.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %171:sreg_32, 31, implicit-def dead $scc ; CHECK: undef %176.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], undef %171:sreg_32, implicit-def $scc @@ -87,55 +92,67 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK: %253.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: undef %261.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], undef %171:sreg_32, implicit-def $scc ; CHECK: %261.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK: undef 
%273.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef %273.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %273.sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK: undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %286.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %293.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK: undef %293.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %293.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, undef %314:sreg_32, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 16, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 
0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0, 0 :: (load 16 from %ir.111, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0, 0 :: (load 16 from %ir.117, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0, 0 :: (load 16 from %ir.123, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0, 0 :: (load 16 from %ir.131, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0, 0 :: (load 16 from %ir.138, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: early-clobber %379:sreg_32_xm0_xexec, early-clobber %201:sgpr_128, early-clobber %177:sgpr_128, early-clobber %184:sgpr_128, early-clobber %319:sreg_32_xm0_xexec, early-clobber %191:sgpr_128, early-clobber 
%309:sreg_32_xm0_xexec, early-clobber %323:sreg_32_xm0_xexec, early-clobber %368:sreg_32_xm0_xexec, early-clobber %313:sreg_32_xm0_xexec, early-clobber %211:sgpr_128 = BUNDLE [[S_ADD_I32_]], %71, undef %369:sgpr_128, %210, undef %314:sreg_32, %200, undef %380:sgpr_128, %176, %183, [[S_ADD_I32_1]], %190, undef %370:sreg_32 { + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, undef %314:sreg_32, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 16, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0, 0 :: (load 16 from %ir.111, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0, 0 :: (load 16 from %ir.117, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0, 0 :: (load 16 from %ir.123, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0, 0 :: (load 16 from %ir.131, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0, 0 :: (load 16 from %ir.138, addrspace 4) + ; CHECK: } + ; CHECK: early-clobber %151:vgpr_32, early-clobber %158:vgpr_32, 
early-clobber %165:vgpr_32 = BUNDLE [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], [[S_LOAD_DWORDX4_IMM3]], [[S_LOAD_DWORDX4_IMM4]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: } + ; CHECK: early-clobber %374:sreg_32_xm0_xexec, early-clobber %363:sreg_32_xm0_xexec = BUNDLE [[S_ADD_I32_]], undef %364:sgpr_128, undef %375:sgpr_128, [[S_ADD_I32_1]] { + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: } ; CHECK: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR]], -98, implicit-def dead $scc ; CHECK: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR1]], -114, implicit-def dead $scc ; CHECK: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR2]], -130, implicit-def dead $scc ; CHECK: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc - ; CHECK: undef %327.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef 
%327.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %327.sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %335.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK: undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %343.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK: undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %351.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY9]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc ; CHECK: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %396:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 
from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0, 0 :: (load 16 from %ir.155, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0, 0 :: (load 16 from %ir.144, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0, 0 :: (load 16 from %ir.150, addrspace 4) + ; CHECK: early-clobber %218:sgpr_128, early-clobber %225:sgpr_128, early-clobber %231:sgpr_128 = BUNDLE %217, %224, %50 { + ; CHECK: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0, 0 :: (load 16 from %ir.155, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0, 0 :: (load 16 from %ir.144, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0, 0 :: (load 16 from %ir.150, addrspace 4) + ; CHECK: } ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0, 0 :: (load 16 from %ir.162, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0, 0 :: (load 16 from %ir.170, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: early-clobber %254:sgpr_128, 
early-clobber %242:sgpr_128 = BUNDLE %253, %241 { + ; CHECK: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0, 0 :: (load 16 from %ir.162, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0, 0 :: (load 16 from %ir.170, addrspace 4) + ; CHECK: } + ; CHECK: early-clobber %212:vgpr_32, early-clobber %202:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM8]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: } ; CHECK: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -217, implicit-def dead $scc ; CHECK: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -233, implicit-def dead $scc ; CHECK: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR5]], -249, implicit-def dead $scc @@ -144,35 +161,41 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -329, implicit-def dead $scc ; CHECK: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -345, implicit-def dead $scc ; CHECK: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR6]], -441, implicit-def dead $scc - ; CHECK: [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 160, implicit-def $scc + ; CHECK: [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 160, implicit-def $scc ; CHECK: [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = 
S_ADDC_U32 undef %36:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %411.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %411.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 4, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc ; CHECK: undef %425.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_4]], implicit-def $scc ; CHECK: %425.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 %56.sub0, 168, implicit-def $scc ; CHECK: [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %57:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc ; CHECK: undef %441.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc ; CHECK: %441.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc ; CHECK: 
[[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %441, 0, 0, 0 :: (load 4 from %ir..i085.i, align 8, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0, 0 :: (load 16 from %ir.176, addrspace 4) + ; CHECK: early-clobber %71.sub0:sgpr_128, early-clobber %262:sgpr_128 = BUNDLE %261, %441 { + ; CHECK: internal %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %441, 0, 0, 0 :: (load 4 from %ir..i085.i, align 8, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0, 0 :: (load 16 from %ir.176, addrspace 4) + ; CHECK: } ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0, 0 :: (load 16 from %ir.185, addrspace 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0, 0 :: (load 16 from %ir.194, addrspace 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: early-clobber %445:sreg_32_xm0_xexec, early-clobber %287:sgpr_128 = BUNDLE %71, %286 { + ; CHECK: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0, 0 :: (load 16 from %ir.194, addrspace 4) + ; CHECK: 
[[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: } ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0, 0 :: (load 16 from %ir.200, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc + ; CHECK: early-clobber %281:vgpr_32, early-clobber %275:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM1]], [[S_LOAD_DWORDX4_IMM16]], [[V_MOV_B32_e32_]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: } + ; CHECK: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 
4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc @@ -180,16 +203,20 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK: %453.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %453, 0, 0, 0 :: (load 8 from %ir.304, addrspace 4) ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0, 0 :: (load 16 from %ir.223, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0, 0 :: (load 16 from %ir.230, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0, 0 :: (load 16 from %ir.236, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0, 0 :: (load 16 from %ir.242, addrspace 4) + ; CHECK: early-clobber %336:sgpr_128, early-clobber %352:sgpr_128, early-clobber %328:sgpr_128, early-clobber %344:sgpr_128 = BUNDLE %327, %343, %335, %351 { + ; CHECK: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0, 0 :: (load 16 from %ir.223, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0, 0 :: (load 16 from %ir.230, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0, 0 :: (load 16 from %ir.236, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0, 0 :: (load 16 from %ir.242, addrspace 4) + ; 
CHECK: } ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 3, implicit-def dead $scc + ; CHECK: early-clobber %329:vgpr_32, early-clobber %345:vgpr_32, early-clobber %337:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM20]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], [[S_LOAD_DWORDX4_IMM21]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 
from custom "BufferResource", align 1, addrspace 4) + ; CHECK: } + ; CHECK: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc @@ -197,18 +224,22 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK: %468.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %468, 0, 0, 0 :: (load 8 from %ir.316, addrspace 4) ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0, 0 :: (load 16 from %ir.278, addrspace 4) - ; CHECK: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0, 0 :: (load 4 from `i32 addrspace(4)* undef`, addrspace 4) - ; CHECK: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 3, implicit-def dead $scc + ; CHECK: early-clobber %412:sgpr_128, early-clobber %487:sreg_32_xm0_xexec, early-clobber %475:sreg_32_xm0_xexec = BUNDLE %71, undef %488:sreg_64, %411 { + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0, 0 :: (load 16 from %ir.278, addrspace 4) + ; CHECK: 
[[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0, 0 :: (load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; CHECK: } + ; CHECK: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 3, implicit-def dead $scc ; CHECK: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0, 0 :: (load 16 from %ir.287, addrspace 4) ; CHECK: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc ; CHECK: undef %485.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_8]], implicit-def $scc ; CHECK: %485.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %485, 0, 0, 0 :: (load 4 from %ir..i0100.i, align 8, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: early-clobber %413:vgpr_32, early-clobber %427:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM23]], [[S_LOAD_DWORDX4_IMM24]], [[V_MOV_B32_e32_]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: } ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc @@ -217,8 +248,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -507, implicit-def dead $scc ; CHECK: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -539, implicit-def dead $scc ; CHECK: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc - ; CHECK: [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) - ; CHECK: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[SI_SPILL_S32_RESTORE]], 96, implicit-def $scc + ; CHECK: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 96, implicit-def $scc ; CHECK: [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %33:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %514.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %514.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc @@ -229,9 +259,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK: undef %530.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0, 0 :: (load 16 from %ir.359, 
addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: early-clobber %516:vgpr_32, early-clobber %532:vgpr_32, early-clobber %524:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM26]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], [[S_LOAD_DWORDX4_IMM27]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: } ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], implicit $exec ; CHECK: 
[[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec @@ -336,13 +368,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK: [[V_OR_B32_e32_61:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_60]], [[V_ADD_U32_e32_25]], implicit $exec ; CHECK: [[V_ADD_U32_e32_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_OR_B32_e32_62:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_61]], [[V_ADD_U32_e32_26]], implicit $exec - ; CHECK: [[SI_SPILL_S32_RESTORE1:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.0, addrspace 5) - ; CHECK: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 16 from %stack.1, align 4, addrspace 5) - ; CHECK: undef %914.sub2_sub3:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub2_sub3 { - ; CHECK: internal %914.sub0:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub0 - ; CHECK: } - ; CHECK: %914.sub1:sgpr_128 = COPY [[SI_SPILL_S32_RESTORE1]] - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %914, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[COPY13]].sub1:sgpr_128 = COPY [[S_AND_B32_]] + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY13]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[V_ADD_U32_e32_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_OR_B32_e32_63:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_62]], [[V_ADD_U32_e32_27]], implicit $exec ; CHECK: [[V_ADD_U32_e32_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll index ad9ce86..2ab384f 100644 --- 
a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll +++ b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll @@ -1,10 +1,8 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx904 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NO-ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx904 -mattr=+sramecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+sramecc < %s | FileCheck -check-prefixes=GCN,ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-sramecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s ; Make sure the correct set of targets are marked with ; FeatureDoesNotSupportSRAMECC, and +sram-ecc is ignored if it's never diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll new file mode 100644 index 0000000..d0fe964 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s + +; REQUIRES: asserts + +; NOT-SUPPORTED: 
sramecc setting for subtarget: Unsupported +; ANY: sramecc setting for subtarget: Any +define void @sramecc-subtarget-feature-default() #0 { + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll new file mode 100644 index 0000000..e2e25eb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s + +; REQUIRES: asserts + +; WARN: warning: sramecc 'Off' was requested for a processor that does not support it! +; OFF: sramecc setting for subtarget: Off + +define void @sramecc-subtarget-feature-disabled() #0 { + ret void +} + +attributes #0 = { "target-features"="-sramecc" } diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll new file mode 100644 index 0000000..2603f7d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s + +; REQUIRES: asserts + +; WARN: warning: sramecc 'On' was requested for a processor that does not support it! 
+; ON: sramecc setting for subtarget: On +define void @sramecc-subtarget-feature-enabled() #0 { + ret void +} + +attributes #0 = { "target-features"="+sramecc" } diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll index 47dcfa8..aa7ef20 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; @@ -51,43 +51,43 @@ define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 ; GFX9-NEXT: ds_write_b8 v0, v2 offset:8 
; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s4, s3, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_lshr_b32 s0, s7, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s7, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s6, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s2, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 @@ -205,20 +205,20 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: 
store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm @@ -287,15 +287,15 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3 ; GFX9-NEXT: s_endpgm ; @@ -335,14 +335,14 @@ define amdgpu_kernel void 
@store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm ; @@ -380,14 +380,14 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll index 2788a52..2994385 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -6,13 +6,13 @@ define 
amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; @@ -49,34 +49,34 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 ; GFX9-NEXT: ds_write_b8 v0, v2 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s6, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX9-NEXT: s_lshr_b32 s2, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 @@ -172,17 +172,17 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm @@ -241,13 +241,13 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> 
%x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX9-NEXT: ds_write_b32 v0, v3 offset:8 ; GFX9-NEXT: s_endpgm @@ -286,13 +286,13 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm @@ -331,13 +331,13 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 74da734..f2234bd 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -15,10 +15,10 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -125,11 +125,11 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -382,16 +382,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -436,20 +436,20 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v0 +; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -494,16 +494,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1) ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, 
v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -550,15 +550,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index b3e158f..c28b9bd2 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -18,11 +18,12 @@ define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_234u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; 
GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -74,11 +75,12 @@ define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_3u6u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -90,11 +92,12 @@ define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_3uu7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -106,15 +109,15 @@ define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_35u5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -126,14 +129,14 @@ define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_357u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 
s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -173,9 +176,12 @@ define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_0145: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -187,9 +193,12 @@ define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -229,9 +238,12 @@ define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_2345: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; 
GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -243,9 +255,12 @@ define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_2367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -257,11 +272,12 @@ define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_4501: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -273,11 +289,12 @@ define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_4523: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; 
GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -316,11 +333,12 @@ define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_6701: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -332,11 +350,12 @@ define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_6723: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -376,13 +395,14 @@ define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_2356: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -394,13 +414,14 @@ define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_5623: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -412,14 +433,15 @@ define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: 
shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -431,15 +453,14 @@ define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_5634: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; 
GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -451,16 +472,16 @@ define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_5734: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -472,13 +493,14 @@ define <4 x i16> @shuffle_v4i16_2356(<4 x 
i16> addrspace(1)* %arg0, <4 x i16> ad ; GFX9-LABEL: shuffle_v4i16_2356: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -490,9 +512,12 @@ define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad ; GFX9-LABEL: shuffle_v4i16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -556,12 +581,12 @@ define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_6161: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; 
GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -634,9 +659,12 @@ define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> ; GFX9-LABEL: shuffle_v8f16_4589: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:8 -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -648,11 +676,12 @@ define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x ; GFX9-LABEL: shuffle_v8f16_10_11_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -664,13 +693,14 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 
x ; GFX9-LABEL: shuffle_v8f16_13_14_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -713,14 +743,16 @@ define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x hal ; GFX9-LABEL: shuffle_v6f16_452367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: global_load_dword v3, v[3:4], off +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off +; GFX9-NEXT: global_load_dword v7, v[3:4], off ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1 @@ -732,18 +764,18 @@ define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readon ; GFX9-LABEL: fma_shuffle: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm entry: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -778,15 +810,15 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_0456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v7, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 
= load <4 x half>, <4 x half> addrspace(1)* %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll index d9b4149..2b964ad 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -9,21 +9,21 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 { ; GCN-NEXT: BB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 -; GCN-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_u32_e64 v6, vcc_lo, v0, 8 ; GCN-NEXT: s_mov_b32 s5, exec_lo -; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] +; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-NEXT: v_readfirstlane_b32 s9, v5 -; GCN-NEXT: v_readfirstlane_b32 s10, v2 -; GCN-NEXT: v_readfirstlane_b32 s11, v3 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5] -; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GCN-NEXT: v_readfirstlane_b32 s8, v2 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3] +; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 ; GCN-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll new file mode 100644 index 0000000..8c146fb --- 
/dev/null +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx902 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s + +; REQUIRES: asserts + +; NOT-SUPPORTED: xnack setting for subtarget: Unsupported +; ANY: xnack setting for subtarget: Any +define void @xnack-subtarget-feature-any() #0 { + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll new file mode 100644 index 0000000..81c68d0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck 
--check-prefix=OFF %s + +; REQUIRES: asserts + +; WARN: warning: xnack 'Off' was requested for a processor that does not support it! +; OFF: xnack setting for subtarget: Off + +define void @xnack-subtarget-feature-disabled() #0 { + ret void +} + +attributes #0 = { "target-features"="-xnack" } diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll new file mode 100644 index 0000000..8dcb730 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s + +; REQUIRES: asserts + +; WARN: warning: xnack 'On' was requested for a processor that does not support it! 
+; ON: xnack setting for subtarget: On +define void @xnack-subtarget-feature-enabled() #0 { + ret void +} + +attributes #0 = { "target-features"="+xnack" } diff --git a/llvm/test/MC/AMDGPU/xnack-mask.s b/llvm/test/MC/AMDGPU/xnack-mask.s index e6e3107..e3fe6e9 100644 --- a/llvm/test/MC/AMDGPU/xnack-mask.s +++ b/llvm/test/MC/AMDGPU/xnack-mask.s @@ -1,10 +1,10 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s // RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s // RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1001 %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1001 -mattr=-xnack %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney %s 2>&1 | FileCheck -check-prefix=XNACKERR --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -show-encoding %s | FileCheck -check-prefix=XNACK %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -mattr=+xnack %s 2>&1 | FileCheck -check-prefix=XNACKERR --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -mattr=+xnack -show-encoding %s | FileCheck -check-prefix=XNACK %s s_mov_b64 xnack_mask, -1 // NOSICIVI10: error: register not available on this GPU |