diff options
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 295 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll | 636 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll | 39 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll | 359 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/extract-subvector.ll | 498 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 24 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 24 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll | 2354 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll | 352 |
11 files changed, 2565 insertions, 2028 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 69fdeae..7623b73 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -81,6 +81,73 @@ public: bool visitLoadInst(LoadInst &LI); }; +using ValueToValueMap = DenseMap<const Value *, Value *>; + +class LiveRegOptimizer { +private: + Module *Mod = nullptr; + const DataLayout *DL = nullptr; + const GCNSubtarget *ST; + /// The scalar type to convert to + Type *ConvertToScalar; + /// The set of visited Instructions + SmallPtrSet<Instruction *, 4> Visited; + /// The set of Instructions to be deleted + SmallPtrSet<Instruction *, 4> DeadInstrs; + /// Map of Value -> Converted Value + ValueToValueMap ValMap; + /// Map containing conversions from Optimal Type -> Original Type per BB. + DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap; + +public: + /// Calculate and return the type to convert to given a problematic \p + /// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32). + Type *calculateConvertType(Type *OriginalType); + /// Convert the virtual register defined by \p V to the compatible vector of + /// legal type + Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt); + /// Convert the virtual register defined by \p V back to the original type \p + /// ConvertType, stripping away the MSBs in cases where there was an imperfect + /// fit (e.g. v2i32 -> v7i8) + Value *convertFromOptType(Type *ConvertType, Instruction *V, + BasicBlock::iterator &InstPt, + BasicBlock *InsertBlock); + /// Check for problematic PHI nodes or cross-bb values based on the value + /// defined by \p I, and coerce to legal types if necessary. For a problematic + /// PHI node, we coerce all incoming values in a single invocation. + bool optimizeLiveType(Instruction *I); + + /// Remove all instructions that have become dead (i.e. 
all the re-typed PHIs) + void removeDeadInstrs(); + + // Whether or not the type should be replaced to avoid inefficient + // legalization code + bool shouldReplace(Type *ITy) { + FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy); + if (!VTy) + return false; + + auto TLI = ST->getTargetLowering(); + + Type *EltTy = VTy->getElementType(); + // If the element size is not less than the convert to scalar size, then we + // can't do any bit packing + if (!EltTy->isIntegerTy() || + EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits()) + return false; + + // Only coerce illegal types + TargetLoweringBase::LegalizeKind LK = + TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false)); + return LK.first != TargetLoweringBase::TypeLegal; + } + + LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) { + DL = &Mod->getDataLayout(); + ConvertToScalar = Type::getInt32Ty(Mod->getContext()); + } +}; + } // end anonymous namespace bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) { @@ -102,14 +169,238 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) { AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); + // "Optimize" the virtual regs that cross basic block boundaries. When + // building the SelectionDAG, vectors of illegal types that cross basic blocks + // will be scalarized and widened, with each scalar living in its + // own register. To work around this, this optimization converts the + // vectors to equivalent vectors of legal type (which are converted back + // before uses in subsequent blocks), to pack the bits into fewer physical + // registers (used in CopyToReg/CopyFromReg pairs). 
+ LiveRegOptimizer LRO(Mod, &ST); + bool Changed = false; + for (auto &BB : F) - for (Instruction &I : llvm::make_early_inc_range(BB)) + for (Instruction &I : make_early_inc_range(BB)) { Changed |= visit(I); + Changed |= LRO.optimizeLiveType(&I); + } + LRO.removeDeadInstrs(); return Changed; } +Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) { + assert(OriginalType->getScalarSizeInBits() <= + ConvertToScalar->getScalarSizeInBits()); + + FixedVectorType *VTy = cast<FixedVectorType>(OriginalType); + + TypeSize OriginalSize = DL->getTypeSizeInBits(VTy); + TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar); + unsigned ConvertEltCount = + (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize; + + if (OriginalSize <= ConvertScalarSize) + return IntegerType::get(Mod->getContext(), ConvertScalarSize); + + return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize), + ConvertEltCount, false); +} + +Value *LiveRegOptimizer::convertToOptType(Instruction *V, + BasicBlock::iterator &InsertPt) { + FixedVectorType *VTy = cast<FixedVectorType>(V->getType()); + Type *NewTy = calculateConvertType(V->getType()); + + TypeSize OriginalSize = DL->getTypeSizeInBits(VTy); + TypeSize NewSize = DL->getTypeSizeInBits(NewTy); + + IRBuilder<> Builder(V->getParent(), InsertPt); + // If there is a bitsize match, we can fit the old vector into a new vector of + // desired type. + if (OriginalSize == NewSize) + return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc"); + + // If there is a bitsize mismatch, we must use a wider vector. 
+ assert(NewSize > OriginalSize); + uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits(); + + SmallVector<int, 8> ShuffleMask; + uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue(); + for (unsigned I = 0; I < OriginalElementCount; I++) + ShuffleMask.push_back(I); + + for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++) + ShuffleMask.push_back(OriginalElementCount); + + Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask); + return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc"); +} + +Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V, + BasicBlock::iterator &InsertPt, + BasicBlock *InsertBB) { + FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType); + + TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType()); + TypeSize NewSize = DL->getTypeSizeInBits(NewVTy); + + IRBuilder<> Builder(InsertBB, InsertPt); + // If there is a bitsize match, we simply convert back to the original type. + if (OriginalSize == NewSize) + return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc"); + + // If there is a bitsize mismatch, then we must have used a wider value to + // hold the bits. + assert(OriginalSize > NewSize); + // For wide scalars, we can just truncate the value. + if (!V->getType()->isVectorTy()) { + Instruction *Trunc = cast<Instruction>( + Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize))); + return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy)); + } + + // For wider vectors, we must strip the MSBs to convert back to the original + // type. 
+ VectorType *ExpandedVT = VectorType::get( + Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()), + (OriginalSize / NewVTy->getScalarSizeInBits()), false); + Instruction *Converted = + cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT)); + + unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue(); + SmallVector<int, 8> ShuffleMask(NarrowElementCount); + std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0); + + return Builder.CreateShuffleVector(Converted, ShuffleMask); +} + +bool LiveRegOptimizer::optimizeLiveType(Instruction *I) { + SmallVector<Instruction *, 4> Worklist; + SmallPtrSet<PHINode *, 4> PhiNodes; + SmallPtrSet<Instruction *, 4> Defs; + SmallPtrSet<Instruction *, 4> Uses; + + Worklist.push_back(cast<Instruction>(I)); + while (!Worklist.empty()) { + Instruction *II = Worklist.pop_back_val(); + + if (!Visited.insert(II).second) + continue; + + if (!shouldReplace(II->getType())) + continue; + + if (PHINode *Phi = dyn_cast<PHINode>(II)) { + PhiNodes.insert(Phi); + // Collect all the incoming values of problematic PHI nodes. + for (Value *V : Phi->incoming_values()) { + // Repeat the collection process for newly found PHI nodes. + if (PHINode *OpPhi = dyn_cast<PHINode>(V)) { + if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi)) + Worklist.push_back(OpPhi); + continue; + } + + Instruction *IncInst = dyn_cast<Instruction>(V); + // Other incoming value types (e.g. vector literals) are unhandled + if (!IncInst && !isa<ConstantAggregateZero>(V)) + return false; + + // Collect all other incoming values for coercion. + if (IncInst) + Defs.insert(IncInst); + } + } + + // Collect all relevant uses. + for (User *V : II->users()) { + // Repeat the collection process for problematic PHI nodes. 
+ if (PHINode *OpPhi = dyn_cast<PHINode>(V)) { + if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi)) + Worklist.push_back(OpPhi); + continue; + } + + Instruction *UseInst = cast<Instruction>(V); + // Collect all uses of PHINodes and any use that crosses BB boundaries. + if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) { + Uses.insert(UseInst); + if (!Defs.count(II) && !isa<PHINode>(II)) { + Defs.insert(II); + } + } + } + } + + // Coerce and track the defs. + for (Instruction *D : Defs) { + if (!ValMap.contains(D)) { + BasicBlock::iterator InsertPt = std::next(D->getIterator()); + Value *ConvertVal = convertToOptType(D, InsertPt); + assert(ConvertVal); + ValMap[D] = ConvertVal; + } + } + + // Construct new-typed PHI nodes. + for (PHINode *Phi : PhiNodes) { + ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()), + Phi->getNumIncomingValues(), + Phi->getName() + ".tc", Phi->getIterator()); + } + + // Connect all the PHI nodes with their new incoming values. + for (PHINode *Phi : PhiNodes) { + PHINode *NewPhi = cast<PHINode>(ValMap[Phi]); + bool MissingIncVal = false; + for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) { + Value *IncVal = Phi->getIncomingValue(I); + if (isa<ConstantAggregateZero>(IncVal)) { + Type *NewType = calculateConvertType(Phi->getType()); + NewPhi->addIncoming(ConstantInt::get(NewType, 0, false), + Phi->getIncomingBlock(I)); + } else if (ValMap.contains(IncVal)) + NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I)); + else + MissingIncVal = true; + } + DeadInstrs.insert(MissingIncVal ? cast<Instruction>(ValMap[Phi]) : Phi); + } + // Coerce back to the original type and replace the uses. + for (Instruction *U : Uses) { + // Replace all converted operands for a use. 
+ for (auto [OpIdx, Op] : enumerate(U->operands())) { + if (ValMap.contains(Op)) { + Value *NewVal = nullptr; + if (BBUseValMap.contains(U->getParent()) && + BBUseValMap[U->getParent()].contains(ValMap[Op])) + NewVal = BBUseValMap[U->getParent()][ValMap[Op]]; + else { + BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt(); + NewVal = + convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]), + InsertPt, U->getParent()); + BBUseValMap[U->getParent()][ValMap[Op]] = NewVal; + } + assert(NewVal); + U->setOperand(OpIdx, NewVal); + } + } + } + + return true; +} + +void LiveRegOptimizer::removeDeadInstrs() { + // Remove instrs that have been marked dead after type-coercion. + for (auto *I : DeadInstrs) { + I->replaceAllUsesWith(PoisonValue::get(I->getType())); + I->eraseFromParent(); + } +} + bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const { unsigned AS = LI.getPointerAddressSpace(); // Skip non-constant address space. @@ -119,7 +410,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const { // Skip non-simple loads. if (!LI.isSimple()) return false; - auto *Ty = LI.getType(); + Type *Ty = LI.getType(); // Skip aggregate types. if (Ty->isAggregateType()) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9162e11..f50a18c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1197,10 +1197,10 @@ bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); if (TM->getOptLevel() > CodeGenOptLevel::None) - addPass(createAMDGPULateCodeGenPreparePass()); + addPass(createSinkingPass()); if (TM->getOptLevel() > CodeGenOptLevel::None) - addPass(createSinkingPass()); + addPass(createAMDGPULateCodeGenPreparePass()); // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll new file mode 100644 index 0000000..83cb92210 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -0,0 +1,636 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s + +define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: v3i8_liveout: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX906-NEXT: v_mov_b32_e32 v3, 8 +; GFX906-NEXT: v_mov_b32_e32 v5, 16 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dword v4, v2, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v1, 0xff +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v4 +; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4 +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_cbranch_execz .LBB0_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dword v0, v2, s[6:7] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0 +; GFX906-NEXT: .LBB0_2: ; %bb.2 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: 
v_lshrrev_b32_e32 v0, 8, v4 +; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: global_store_short v1, v0, s[2:3] +; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[2:3] offset:2 +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <3 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <3 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: v4i8_liveout: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dword v1, v2, s[4:5] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_cbranch_execz .LBB1_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dword v1, v2, s[6:7] +; GFX906-NEXT: .LBB1_2: ; %bb.2 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] +; 
GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: v5i8_liveout: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_cbranch_execz .LBB2_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX906-NEXT: .LBB2_2: ; %bb.2 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: v_mov_b32_e32 v4, 0 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX906-NEXT: global_store_byte v4, v1, s[2:3] +; GFX906-NEXT: global_store_byte v4, v0, s[2:3] offset:1 +; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[2:3] offset:2 +; GFX906-NEXT: global_store_byte v4, v3, s[2:3] offset:3 +; GFX906-NEXT: global_store_byte v4, v2, s[2:3] offset:4 +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <5 x 
i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <5 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: v8i8_liveout: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_cbranch_execz .LBB3_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] +; GFX906-NEXT: .LBB3_2: ; %bb.2 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: v16i8_liveout: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_cbranch_execz .LBB4_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7] +; GFX906-NEXT: .LBB4_2: ; %bb.2 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <16 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <16 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <16 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <16 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <16 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: v32i8_liveout: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5] +; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] offset:16 +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_cbranch_execz .LBB5_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16 +; GFX906-NEXT: .LBB5_2: ; %bb.2 +; GFX906-NEXT: s_or_b64 
exec, exec, s[0:1] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[2:3] offset:16 +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <32 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <32 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <32 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <32 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <32 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: v256i8_liveout: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s10, -1 +; GFX906-NEXT: s_mov_b32 s11, 0xe00000 +; GFX906-NEXT: s_add_u32 s8, s8, s3 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX906-NEXT: s_addc_u32 s9, s9, 0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] offset:16 +; 
GFX906-NEXT: s_nop 0 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[4:5] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[4:5] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[4:5] offset:64 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[4:5] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[4:5] offset:96 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[4:5] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[4:5] offset:128 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[4:5] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[4:5] offset:160 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[4:5] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[4:5] offset:192 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[4:5] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[4:5] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] offset:240 +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_cbranch_execz .LBB6_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[6:7] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[6:7] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[6:7] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[6:7] offset:64 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[6:7] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[6:7] offset:96 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, 
s[6:7] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[6:7] offset:128 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[6:7] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[6:7] offset:160 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[6:7] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[6:7] offset:192 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[6:7] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[6:7] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] offset:240 +; GFX906-NEXT: .LBB6_2: ; %bb.2 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: v_mov_b32_e32 v0, v57 +; GFX906-NEXT: v_mov_b32_e32 v1, v58 +; GFX906-NEXT: v_mov_b32_e32 v2, v59 +; GFX906-NEXT: v_mov_b32_e32 v3, v60 +; GFX906-NEXT: v_mov_b32_e32 v60, v56 +; GFX906-NEXT: v_mov_b32_e32 v59, v55 +; GFX906-NEXT: v_mov_b32_e32 v58, v54 +; GFX906-NEXT: v_mov_b32_e32 v57, v53 +; GFX906-NEXT: v_mov_b32_e32 v56, v52 +; GFX906-NEXT: v_mov_b32_e32 v55, v51 +; GFX906-NEXT: v_mov_b32_e32 v54, v50 +; GFX906-NEXT: v_mov_b32_e32 v53, v49 +; GFX906-NEXT: v_mov_b32_e32 v52, v48 +; GFX906-NEXT: v_mov_b32_e32 v51, v47 +; GFX906-NEXT: v_mov_b32_e32 v50, v46 +; GFX906-NEXT: v_mov_b32_e32 v49, v45 +; GFX906-NEXT: v_mov_b32_e32 v48, v44 +; GFX906-NEXT: v_mov_b32_e32 v47, v43 +; GFX906-NEXT: v_mov_b32_e32 v46, v42 +; GFX906-NEXT: v_mov_b32_e32 v45, v41 +; GFX906-NEXT: v_mov_b32_e32 v44, v40 +; GFX906-NEXT: v_mov_b32_e32 v43, v39 +; GFX906-NEXT: v_mov_b32_e32 v42, v38 +; GFX906-NEXT: v_mov_b32_e32 v41, v37 +; 
GFX906-NEXT: v_mov_b32_e32 v40, v36 +; GFX906-NEXT: v_mov_b32_e32 v39, v35 +; GFX906-NEXT: v_mov_b32_e32 v38, v34 +; GFX906-NEXT: v_mov_b32_e32 v37, v33 +; GFX906-NEXT: v_mov_b32_e32 v36, v32 +; GFX906-NEXT: v_mov_b32_e32 v35, v31 +; GFX906-NEXT: v_mov_b32_e32 v34, v30 +; GFX906-NEXT: v_mov_b32_e32 v33, v29 +; GFX906-NEXT: v_mov_b32_e32 v32, v28 +; GFX906-NEXT: v_mov_b32_e32 v31, v27 +; GFX906-NEXT: v_mov_b32_e32 v30, v26 +; GFX906-NEXT: v_mov_b32_e32 v29, v25 +; GFX906-NEXT: v_mov_b32_e32 v28, v24 +; GFX906-NEXT: v_mov_b32_e32 v27, v23 +; GFX906-NEXT: v_mov_b32_e32 v26, v22 +; GFX906-NEXT: v_mov_b32_e32 v25, v21 +; GFX906-NEXT: v_mov_b32_e32 v24, v20 +; GFX906-NEXT: v_mov_b32_e32 v23, v19 +; GFX906-NEXT: v_mov_b32_e32 v22, v18 +; GFX906-NEXT: v_mov_b32_e32 v21, v17 +; GFX906-NEXT: v_mov_b32_e32 v20, v16 +; GFX906-NEXT: v_mov_b32_e32 v19, v15 +; GFX906-NEXT: v_mov_b32_e32 v18, v14 +; GFX906-NEXT: v_mov_b32_e32 v17, v13 +; GFX906-NEXT: v_mov_b32_e32 v16, v12 +; GFX906-NEXT: v_mov_b32_e32 v15, v11 +; GFX906-NEXT: v_mov_b32_e32 v14, v10 +; GFX906-NEXT: v_mov_b32_e32 v13, v9 +; GFX906-NEXT: v_mov_b32_e32 v12, v8 +; GFX906-NEXT: v_mov_b32_e32 v11, v7 +; GFX906-NEXT: v_mov_b32_e32 v10, v6 +; GFX906-NEXT: v_mov_b32_e32 v9, v5 +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: v_mov_b32_e32 v4, 0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[2:3] +; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[2:3] offset:16 +; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[2:3] offset:32 +; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[2:3] offset:48 +; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[2:3] offset:64 +; GFX906-NEXT: 
global_store_dwordx4 v4, v[25:28], s[2:3] offset:80 +; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[2:3] offset:96 +; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[2:3] offset:112 +; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[2:3] offset:128 +; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[2:3] offset:144 +; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[2:3] offset:160 +; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[2:3] offset:176 +; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[2:3] offset:192 +; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[2:3] offset:208 +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224 +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:240 +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <256 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <256 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + + +define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: repeat_successor: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; 
GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: s_cmp_lt_i32 s2, 3 +; GFX906-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX906-NEXT: ; %bb.1: ; %LeafBlock +; GFX906-NEXT: s_cmp_ge_i32 s2, 1 +; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX906-NEXT: ; %bb.2: +; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX906-NEXT: global_load_dword v0, v0, s[4:5] +; GFX906-NEXT: s_branch .LBB7_5 +; GFX906-NEXT: .LBB7_3: ; %LeafBlock5 +; GFX906-NEXT: s_cmp_eq_u32 s2, 3 +; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX906-NEXT: ; %bb.4: ; %sw.bb5 +; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX906-NEXT: global_load_dword v0, v0, s[6:7] +; GFX906-NEXT: .LBB7_5: ; %return.sink.split +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX906-NEXT: global_store_dword v1, v0, s[0:1] +; GFX906-NEXT: .LBB7_6: ; %return +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 + switch i32 %in, label %return [ + i32 1, label %return.sink.split + i32 2, label %return.sink.split + i32 3, label %sw.bb5 + ] + +sw.bb5: + br label %return.sink.split + +return.sink.split: + %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ] + store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void + +return: + ret void +} + +define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { +; GFX906-LABEL: v8i8_phi_chain: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx2 v[1:2], 
v3, s[0:1] +; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1 +; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX906-NEXT: s_cbranch_execz .LBB8_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3] +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX906-NEXT: s_and_b64 s[2:3], exec, vcc +; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX906-NEXT: .LBB8_2: ; %Flow +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX906-NEXT: s_cbranch_execz .LBB8_4 +; GFX906-NEXT: ; %bb.3: ; %bb.2 +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5] +; GFX906-NEXT: .LBB8_4: ; %bb.3 +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7] +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + %cmp2 = icmp ult i32 %idx, 7 + br i1 %cmp2, label %bb.2, label %bb.3 + +bb.2: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4 + br label %bb.3 + +bb.3: + %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2] + store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4 + ret void +} + +define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { +; GFX906-LABEL: v8i8_multi_block: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, 
v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v1, v3 +; GFX906-NEXT: v_mov_b32_e32 v2, v4 +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_cbranch_execz .LBB9_4 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[2:3] +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_cbranch_execz .LBB9_3 +; GFX906-NEXT: ; %bb.2: ; %bb.2 +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5] +; GFX906-NEXT: .LBB9_3: ; %Flow +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: .LBB9_4: ; %bb.3 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7] +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.3 +bb.1: + %cmp2 = icmp ult i32 %idx, 7 + br i1 %cmp2, label %bb.2, label %bb.3 + +bb.2: + store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4 + br label %bb.3 + +bb.3: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2] + store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4 + ret void +} + +define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: v32i8_loop_carried: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; GFX906-NEXT: v_mov_b32_e32 v3, 8 +; 
GFX906-NEXT: v_mov_b32_e32 v2, 0xff +; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dword v1, v1, s[2:3] +; GFX906-NEXT: s_mov_b64 s[2:3], 0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX906-NEXT: v_mov_b32_e32 v2, 24 +; GFX906-NEXT: .LBB10_1: ; %bb.1 +; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc +; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX906-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1 +; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_cbranch_execnz .LBB10_1 +; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 + br label %bb.1 + +bb.1: + %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 + br label %bb.2 + +bb.2: + store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4 + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() + diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll index 93b9aea..11772d25 100644 --- 
a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll @@ -987,8 +987,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) { ; OPT-NEXT: entry: ; OPT-NEXT: [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3 ; OPT-NEXT: switch i8 [[COND:%.*]], label [[ELSE:%.*]] [ -; OPT-NEXT: i8 0, label [[THEN_1:%.*]] -; OPT-NEXT: i8 3, label [[THEN_2:%.*]] +; OPT-NEXT: i8 0, label [[THEN_1:%.*]] +; OPT-NEXT: i8 3, label [[THEN_2:%.*]] ; OPT-NEXT: ] ; OPT: then.1: ; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> <i32 0, i32 1> @@ -1025,8 +1025,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) { ; NOOPT-NEXT: entry: ; NOOPT-NEXT: [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3 ; NOOPT-NEXT: switch i8 [[COND:%.*]], label [[ELSE:%.*]] [ -; NOOPT-NEXT: i8 0, label [[THEN_1:%.*]] -; NOOPT-NEXT: i8 3, label [[THEN_2:%.*]] +; NOOPT-NEXT: i8 0, label [[THEN_1:%.*]] +; NOOPT-NEXT: i8 3, label [[THEN_2:%.*]] ; NOOPT-NEXT: ] ; NOOPT: then.1: ; NOOPT-NEXT: br label [[FINALLY:%.*]] diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index 53acbb6..1e5ec36 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -8,29 +8,30 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp0_b32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %bb10 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, 
s[8:9] +; CHECK-NEXT: global_load_dwordx2 v[8:9], v0, s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v0 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; CHECK-NEXT: v_and_b32_e32 v7, 0xff, v8 +; CHECK-NEXT: v_bfe_u32 v6, v8, 8, 8 +; CHECK-NEXT: v_bfe_u32 v5, v8, 16, 8 +; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8 +; CHECK-NEXT: v_and_b32_e32 v3, 0xff, v9 +; CHECK-NEXT: v_bfe_u32 v2, v9, 8, 8 +; CHECK-NEXT: v_bfe_u32 v1, v9, 16, 8 +; CHECK-NEXT: v_lshrrev_b32_e32 v0, 24, v9 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mov_b32_e32 v5, 0 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_mov_b32_e32 v7, 0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: .LBB0_3: ; %bb41 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x48 ; CHECK-NEXT: v_mov_b32_e32 v8, s10 @@ -47,16 +48,16 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK-NEXT: v_mov_b32_e32 v19, s21 ; CHECK-NEXT: v_mov_b32_e32 v20, s22 ; CHECK-NEXT: v_mov_b32_e32 v21, s23 -; CHECK-NEXT: flat_store_byte v[8:9], v0 -; CHECK-NEXT: flat_store_byte v[10:11], v7 -; CHECK-NEXT: flat_store_byte v[12:13], v6 -; CHECK-NEXT: flat_store_byte v[14:15], v5 -; CHECK-NEXT: flat_store_byte v[16:17], v1 -; CHECK-NEXT: flat_store_byte v[18:19], v4 -; CHECK-NEXT: flat_store_byte v[20:21], v3 +; CHECK-NEXT: flat_store_byte v[8:9], v7 +; CHECK-NEXT: flat_store_byte v[10:11], v6 +; CHECK-NEXT: flat_store_byte v[12:13], v5 +; CHECK-NEXT: flat_store_byte v[14:15], v4 +; CHECK-NEXT: flat_store_byte v[16:17], v3 +; CHECK-NEXT: flat_store_byte v[18:19], v2 +; CHECK-NEXT: flat_store_byte v[20:21], v1 ; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; CHECK-NEXT: flat_store_byte v[2:3], v0 ; CHECK-NEXT: s_endpgm bb: br i1 %arg, label %bb10, label %bb41 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 6dabd8c..efbbe2b 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -13,9 +13,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30,27 +30,25 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v5 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB0_3 ; SI-NEXT: s_branch .LBB0_4 ; SI-NEXT: .LBB0_2: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB0_3: ; %T ; SI-NEXT: 
s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -63,29 +61,29 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v1 ; SI-NEXT: .LBB0_4: ; %exit -; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_mov_b32_e32 v3, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, 0x8000 -; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 -; SI-NEXT: v_bfrev_b32_e32 v6, 1 +; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v2 +; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v3, 0, 16 +; SI-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v4, 1 +; SI-NEXT: v_mov_b32_e32 v5, 0xffff +; SI-NEXT: v_mov_b32_e32 v6, 0x8000 ; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; 
SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_8xi16_extract_4xi16: @@ -180,26 +178,23 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v2 +; SI-NEXT: v_or_b32_e32 v4, v4, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB1_3 ; SI-NEXT: s_branch .LBB1_4 ; SI-NEXT: .LBB1_2: -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 +; 
SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB1_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -214,39 +209,39 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v0 -; SI-NEXT: v_or_b32_e32 v5, v5, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v5, v4, v0 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 ; SI-NEXT: .LBB1_4: ; %exit -; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4 +; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_mov_b32_e32 v4, 0xffff -; SI-NEXT: v_mov_b32_e32 v5, 0x8000 -; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 -; SI-NEXT: v_bfrev_b32_e32 v7, 1 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 +; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v5, 1 +; SI-NEXT: 
v_mov_b32_e32 v6, 0xffff +; SI-NEXT: v_mov_b32_e32 v7, 0x8000 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_8xi16_extract_4xi16_2: @@ -499,9 +494,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -532,27 +527,25 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v5 ; SI-NEXT: s_mov_b64 
vcc, exec ; SI-NEXT: s_cbranch_execz .LBB3_3 ; SI-NEXT: s_branch .LBB3_4 ; SI-NEXT: .LBB3_2: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB3_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -581,29 +574,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v1 ; SI-NEXT: .LBB3_4: ; %exit -; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_mov_b32_e32 v3, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, 0x8000 -; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 -; SI-NEXT: v_bfrev_b32_e32 v6, 1 +; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v2 +; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v3, 0, 16 +; SI-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v4, 1 
+; SI-NEXT: v_mov_b32_e32 v5, 0xffff +; SI-NEXT: v_mov_b32_e32 v6, 0x8000 ; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_16xi16_extract_4xi16: @@ -710,13 +703,13 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc ; 
SI-NEXT: s_waitcnt vmcnt(0) @@ -734,18 +727,15 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v2 +; SI-NEXT: v_or_b32_e32 v4, v4, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB4_3 ; SI-NEXT: s_branch .LBB4_4 ; SI-NEXT: .LBB4_2: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB4_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -760,11 +750,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -785,29 +775,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; SI-NEXT: buffer_load_ushort v0, v[0:1], 
s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v5, v4, v0 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 ; SI-NEXT: .LBB4_4: ; %exit -; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4 +; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 -; SI-NEXT: v_mov_b32_e32 v4, 0xffff -; SI-NEXT: v_mov_b32_e32 v5, 0x8000 -; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 -; SI-NEXT: v_bfrev_b32_e32 v7, 1 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v5, 1 +; SI-NEXT: v_mov_b32_e32 v6, 0xffff +; SI-NEXT: v_mov_b32_e32 v7, 0x8000 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; SI-NEXT: v_or_b32_e32 v0, v1, v8 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1205,21 +1195,21 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc +; SI-NEXT: 
buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1237,46 +1227,39 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v11, v2 -; SI-NEXT: v_or_b32_e32 v8, v8, v12 -; SI-NEXT: v_or_b32_e32 v2, v10, v13 -; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_or_b32_e32 v5, v10, v2 +; SI-NEXT: v_or_b32_e32 v4, v8, v3 +; SI-NEXT: v_or_b32_e32 v3, v7, v9 +; SI-NEXT: v_or_b32_e32 v2, v6, v11 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB7_3 ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_2: -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB7_3: ; %T ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 
addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1294,52 +1277,52 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v0 -; SI-NEXT: v_or_b32_e32 v8, v8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_or_b32_e32 v5, v8, v0 +; SI-NEXT: v_or_b32_e32 v4, v7, v1 +; SI-NEXT: v_or_b32_e32 v3, v6, v9 ; SI-NEXT: v_or_b32_e32 v2, v2, v10 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: .LBB7_4: ; %exit -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_movk_i32 s34, 0x3800 -; SI-NEXT: v_mov_b32_e32 v8, 0x3d00 -; SI-NEXT: v_mov_b32_e32 v9, 0x3900 -; SI-NEXT: v_mov_b32_e32 v10, 0x3d000000 -; SI-NEXT: v_mov_b32_e32 v11, 0x39000000 +; 
SI-NEXT: v_mov_b32_e32 v8, 0x3d000000 +; SI-NEXT: v_mov_b32_e32 v9, 0x39000000 +; SI-NEXT: v_mov_b32_e32 v10, 0x3d00 +; SI-NEXT: v_mov_b32_e32 v11, 0x3900 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v12, v8, v9, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2 +; SI-NEXT: v_cndmask_b32_e32 v13, v8, v9, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4 ; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6 +; SI-NEXT: v_cndmask_b32_e32 v14, v8, v9, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5 +; SI-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7 ; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6 -; SI-NEXT: v_cndmask_b32_e32 v12, v10, v11, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7 -; SI-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4 -; SI-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v5, v12 -; SI-NEXT: v_or_b32_e32 v6, v3, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v4, v1, v13 +; SI-NEXT: v_or_b32_e32 v6, v2, v14 +; SI-NEXT: v_or_b32_e32 v2, v3, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v12, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: 
vec_16xi16_extract_8xi16_0: diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll index 15abf44..36a93bd 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll @@ -1,26 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -; GCN-LABEL: extract_2xi16 -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: v_bfe_i32 -; GCN: v_bfe_i32 - define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { +; GCN-LABEL: extract_2xi16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %F +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: buffer_load_ushort v0, v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:2 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:4 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:6 
glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:8 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:10 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:12 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64 offset:14 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v4, v0, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .LBB0_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_4 +; GCN-NEXT: ; %bb.3: ; %T +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:2 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:6 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:8 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:10 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:12 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:14 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v2, v0 +; GCN-NEXT: .LBB0_4: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_ashrrev_i32_e32 v0, 16, v4 +; GCN-NEXT: v_bfe_i32 v1, v4, 0, 16 +; GCN-NEXT: v_mov_b32_e32 v2, 0xffff +; GCN-NEXT: v_mov_b32_e32 v3, 
0x8000 +; GCN-NEXT: v_mov_b32_e32 v4, 0xffff8000 +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] br i1 %c0, label %T, label %F T: @@ -39,9 +95,59 @@ exit: ret <2 x i16> %r2 } -; GCN-LABEL: extract_2xi64 -; GCN-COUNT-2: v_cndmask_b32 define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { +; GCN-LABEL: extract_2xi64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %F +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .LBB1_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_4 +; GCN-NEXT: ; %bb.3: ; %T +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: .LBB1_4: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc +; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, -1 +; GCN-NEXT: v_mov_b32_e32 v3, -1 +; GCN-NEXT: s_setpc_b64 s[30:31] br i1 %c0, label %T, label %F T: @@ -60,9 +166,65 @@ exit: ret <2 x i64> %r2 } -; GCN-LABEL: extract_4xi64 -; GCN-COUNT-4: v_cndmask_b32 define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { +; GCN-LABEL: extract_4xi64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: ; %bb.1: ; %F +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .LBB2_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB2_4 +; GCN-NEXT: ; %bb.3: ; %T +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: .LBB2_4: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, -1 +; GCN-NEXT: v_mov_b32_e32 v3, -1 +; GCN-NEXT: v_mov_b32_e32 v5, -1 +; GCN-NEXT: v_mov_b32_e32 v7, -1 +; GCN-NEXT: s_setpc_b64 s[30:31] br i1 %c0, label %T, label %F T: @@ -81,9 +243,92 @@ exit: ret <4 x i64> %r2 } -; GCN-LABEL: extract_8xi64 -; GCN-COUNT-8: v_cndmask_b32 define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { +; GCN-LABEL: extract_8xi64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: ; %bb.1: ; %F +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .LBB3_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_4 +; GCN-NEXT: ; %bb.3: ; %T +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: .LBB3_4: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] +; GCN-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[8:9] +; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] +; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13] +; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17] +; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19] +; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, s[16:17] +; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v8, v1, -1, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v10, v1, -1, s[10:11] +; GCN-NEXT: v_cndmask_b32_e64 v12, v1, -1, s[12:13] +; GCN-NEXT: v_cndmask_b32_e64 v14, v1, -1, s[14:15] +; GCN-NEXT: v_mov_b32_e32 v1, -1 +; GCN-NEXT: v_mov_b32_e32 v3, -1 +; GCN-NEXT: v_mov_b32_e32 v5, -1 +; GCN-NEXT: v_mov_b32_e32 v7, -1 +; GCN-NEXT: v_mov_b32_e32 v9, -1 +; GCN-NEXT: v_mov_b32_e32 v11, -1 +; GCN-NEXT: v_mov_b32_e32 v13, -1 +; GCN-NEXT: v_mov_b32_e32 v15, -1 
+; GCN-NEXT: s_setpc_b64 s[30:31] br i1 %c0, label %T, label %F T: @@ -102,9 +347,59 @@ exit: ret <8 x i64> %r2 } -; GCN-LABEL: extract_2xf64 -; GCN-COUNT-2: v_cndmask_b32 define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { +; GCN-LABEL: extract_2xf64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: ; %bb.1: ; %F +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .LBB4_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB4_4 +; GCN-NEXT: ; %bb.3: ; %T +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: .LBB4_4: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000 +; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, -2.0, vcc +; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v3, v0, -2.0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_setpc_b64 s[30:31] br i1 %c0, label %T, label %F T: @@ -123,9 +418,65 @@ exit: ret <2 x double> %r2 } -; GCN-LABEL: extract_4xf64 -; GCN-COUNT-4: v_cndmask_b32 define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { +; GCN-LABEL: extract_4xf64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: ; %bb.1: ; %F +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .LBB5_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: 
s_cbranch_execz .LBB5_4 +; GCN-NEXT: ; %bb.3: ; %T +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: .LBB5_4: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000 +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v1, -2.0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v3, -2.0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[8:9] +; GCN-NEXT: v_cndmask_b32_e32 v5, -2.0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[10:11] +; GCN-NEXT: v_cndmask_b32_e32 v7, -2.0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: s_setpc_b64 s[30:31] br i1 %c0, label %T, label %F T: @@ -144,9 +495,92 @@ exit: ret <4 x double> %r2 } -; GCN-LABEL: extract_8xf64 -; GCN-COUNT-8: v_cndmask_b32 define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) { +; GCN-LABEL: extract_8xf64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: ; implicit-def: 
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB6_2 +; GCN-NEXT: ; %bb.1: ; %F +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .LBB6_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB6_4 +; GCN-NEXT: ; %bb.3: ; %T +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 
addr64 offset:80 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: .LBB6_4: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000 +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7] +; GCN-NEXT: v_cmp_nlt_f64_e64 s[4:5], -1.0, v[8:9] +; GCN-NEXT: v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11] +; GCN-NEXT: v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13] +; GCN-NEXT: v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17] +; GCN-NEXT: v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19] +; GCN-NEXT: v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v1, -2.0, v0, s[16:17] +; GCN-NEXT: v_cndmask_b32_e32 v3, -2.0, v0, vcc +; GCN-NEXT: v_cndmask_b32_e64 v5, -2.0, v0, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v7, -2.0, v0, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v9, -2.0, v0, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v11, -2.0, v0, s[10:11] +; GCN-NEXT: v_cndmask_b32_e64 v13, -2.0, v0, s[12:13] +; GCN-NEXT: v_cndmask_b32_e64 v15, -2.0, v0, s[14:15] +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: s_setpc_b64 s[30:31] br i1 %c0, label %T, label %F T: diff --git 
a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 08cf83f..952e89e 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -255,13 +255,13 @@ ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Flatten the CFG ; GCN-O1-NEXT: Dominator Tree Construction -; GCN-O1-NEXT: Cycle Info Analysis -; GCN-O1-NEXT: Uniformity Analysis -; GCN-O1-NEXT: AMDGPU IR late optimizations ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: Code sinking +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis +; GCN-O1-NEXT: AMDGPU IR late optimizations ; GCN-O1-NEXT: Post-Dominator Tree Construction ; GCN-O1-NEXT: Unify divergent function exit nodes ; GCN-O1-NEXT: Dominator Tree Construction @@ -552,13 +552,13 @@ ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Flatten the CFG ; GCN-O1-OPTS-NEXT: Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Cycle Info Analysis -; GCN-O1-OPTS-NEXT: Uniformity Analysis -; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Natural Loop Information ; GCN-O1-OPTS-NEXT: Code sinking +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis +; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations ; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction ; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes ; GCN-O1-OPTS-NEXT: Dominator Tree Construction @@ -861,13 +861,13 @@ ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Flatten the CFG ; GCN-O2-NEXT: Dominator Tree Construction -; GCN-O2-NEXT: Cycle Info Analysis -; GCN-O2-NEXT: Uniformity Analysis -; GCN-O2-NEXT: AMDGPU IR late optimizations ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; 
GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Code sinking +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis +; GCN-O2-NEXT: AMDGPU IR late optimizations ; GCN-O2-NEXT: Post-Dominator Tree Construction ; GCN-O2-NEXT: Unify divergent function exit nodes ; GCN-O2-NEXT: Dominator Tree Construction @@ -1184,13 +1184,13 @@ ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Flatten the CFG ; GCN-O3-NEXT: Dominator Tree Construction -; GCN-O3-NEXT: Cycle Info Analysis -; GCN-O3-NEXT: Uniformity Analysis -; GCN-O3-NEXT: AMDGPU IR late optimizations ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Code sinking +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis +; GCN-O3-NEXT: AMDGPU IR late optimizations ; GCN-O3-NEXT: Post-Dominator Tree Construction ; GCN-O3-NEXT: Unify divergent function exit nodes ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 0f2eedb..911bb44 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -2101,10 +2101,7 @@ define void @crash_lshlrevb16_not_reg_op() { ; NOSDWA: ; %bb.0: ; %bb0 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOSDWA-NEXT: s_mov_b64 s[4:5], 0 -; NOSDWA-NEXT: v_mov_b32_e32 v0, 0xff -; NOSDWA-NEXT: v_and_b32_e32 v0, s4, v0 -; NOSDWA-NEXT: v_lshlrev_b16_e64 v1, 8, 1 -; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v1 +; NOSDWA-NEXT: v_mov_b32_e32 v0, 0x100 ; NOSDWA-NEXT: s_and_b64 vcc, exec, -1 ; NOSDWA-NEXT: .LBB22_1: ; %bb1 ; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2124,9 +2121,7 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX89: ; %bb.0: ; %bb0 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-NEXT: s_mov_b64 s[4:5], 0 -; GFX89-NEXT: 
v_lshlrev_b16_e64 v0, 8, 1 -; GFX89-NEXT: v_mov_b32_e32 v1, s4 -; GFX89-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_mov_b32_e32 v0, 0x100 ; GFX89-NEXT: s_and_b64 vcc, exec, -1 ; GFX89-NEXT: .LBB22_1: ; %bb1 ; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2146,8 +2141,7 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX9: ; %bb.0: ; %bb0 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_lshlrev_b16_e64 v0, 8, 1 -; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, 0x100 ; GFX9-NEXT: s_and_b64 vcc, exec, -1 ; GFX9-NEXT: .LBB22_1: ; %bb1 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2166,18 +2160,16 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX10-LABEL: crash_lshlrevb16_not_reg_op: ; GFX10: ; %bb.0: ; %bb0 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b16 v0, 8, 1 -; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo -; GFX10-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo ; GFX10-NEXT: .LBB22_1: ; %bb1 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_lshl_b32 s6, s4, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_lshrrev_b16 v3, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_lshrrev_b16 v2, s6, 0x100 ; GFX10-NEXT: s_mov_b64 s[4:5], 1 -; GFX10-NEXT: flat_store_byte v[1:2], v3 +; GFX10-NEXT: flat_store_byte v[0:1], v2 ; GFX10-NEXT: s_cbranch_vccnz .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %DummyReturnBlock ; GFX10-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index f78b408..2355fa7 100644 --- 
a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -6,27 +6,31 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v5, 2, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v2, v5, s[4:5] +; GFX906-NEXT: global_load_dword v4, v2, s[4:5] +; GFX906-NEXT: s_mov_b32 s4, 0xff0000 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dword v2, v5, s[6:7] +; GFX906-NEXT: global_load_dword v0, v2, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_sdwa v2, v3, v0 
dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2 ; GFX906-NEXT: .LBB0_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: global_store_byte v1, v3, s[2:3] offset:2 -; GFX906-NEXT: global_store_short v1, v0, s[2:3] +; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[2:3] offset:2 +; GFX906-NEXT: global_store_short v1, v4, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -50,31 +54,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v2, v6, s[4:5] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX906-NEXT: global_load_dword v2, v3, s[4:5] ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dword v2, v6, s[6:7] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX906-NEXT: global_load_dword v2, v3, s[6:7] ; GFX906-NEXT: .LBB1_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v5 -; 
GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dword v1, v0, s[2:3] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dword v1, v2, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -98,32 +90,23 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v7, 3, v0 -; GFX906-NEXT: v_mov_b32_e32 v5, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX906-NEXT: .LBB2_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v6 -; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_byte v5, v2, s[2:3] offset:4 -; GFX906-NEXT: global_store_dword v5, v0, s[2:3] +; GFX906-NEXT: global_store_byte v3, v2, s[2:3] offset:4 +; GFX906-NEXT: global_store_dword v3, v1, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -147,42 +130,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[4:5] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[6:7] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] ; 
GFX906-NEXT: .LBB3_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v9 -; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7 -; GFX906-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v6 -; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4 -; GFX906-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -206,64 +166,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v18, 4, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[4:5] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 
v13, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[4:5] ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[6:7] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[6:7] ; GFX906-NEXT: .LBB4_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v17 -; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15 -; GFX906-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v14 -; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v12 -; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 
v2, 8, v11 -; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v9 -; GFX906-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v8 -; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v6 -; GFX906-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v5, v[0:3], s[2:3] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -286,114 +201,24 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: v_lshlrev_b32_e32 v31, 5, v0 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: v_lshlrev_b32_e32 v10, 5, v0 ; GFX906-NEXT: v_mov_b32_e32 v9, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[4:5] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[4:5] -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3 -; GFX906-NEXT: 
v_lshrrev_b32_e32 v15, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v18, 24, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v1 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[4:5] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[4:5] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[6:7] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[6:7] -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v18, 24, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v1 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v23, 
8, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[6:7] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[6:7] ; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30 -; GFX906-NEXT: v_lshlrev_b16_e32 v31, 8, v33 -; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; GFX906-NEXT: v_lshlrev_b16_e32 v27, 8, v27 -; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26 -; GFX906-NEXT: v_lshlrev_b16_e32 v24, 8, v24 -; GFX906-NEXT: v_lshlrev_b16_e32 v23, 8, v23 -; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 -; GFX906-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v7, v7, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v8, v8, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: 
v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v20 -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v18 -; GFX906-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v17 -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v15 -; GFX906-NEXT: v_or_b32_sdwa v5, v16, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v14 -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v12 -; GFX906-NEXT: v_or_b32_sdwa v5, v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v11 -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[2:3] offset:16 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -415,1572 +240,595 @@ bb.2: define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v256i8_liveout: ; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: v_lshlrev_b32_e32 v61, 3, v0 ; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX906-NEXT: s_mov_b32 s10, -1 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:240 ; GFX906-NEXT: s_mov_b32 s11, 0xe00000 ; GFX906-NEXT: s_add_u32 s8, s8, s3 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v63, 3, v0 ; GFX906-NEXT: s_addc_u32 s9, s9, 0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:240 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[4:5] offset:224 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[4:5] offset:208 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[4:5] offset:192 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v4, 0 -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, 
v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 -; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11 -; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14 -; 
GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[4:5] offset:160 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill -; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[4:5] offset:144 -; GFX906-NEXT: 
global_load_dwordx4 v[29:32], v63, s[4:5] offset:128 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill -; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[4:5] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[4:5] offset:96 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35 -; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:408 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38 -; 
GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[4:5] offset:80 -; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[4:5] offset:64 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill -; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:580 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[4:5] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[4:5] offset:32 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[4:5] offset:16 +; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, 
s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:224 ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[4:5] -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 
offset:736 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[4:5] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[4:5] offset:192 +; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[4:5] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[4:5] offset:160 +; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[4:5] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[4:5] offset:128 +; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[4:5] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[4:5] offset:96 +; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[4:5] 
offset:80 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[4:5] offset:64 +; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[4:5] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[4:5] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[4:5] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[4:5] +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7] offset:240 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[6:7] offset:224 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[6:7] offset:208 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[6:7] offset:192 -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v3 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v3 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v2 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v1 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 
v17, 24, v0 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] offset:240 +; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 
4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 
4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[6:7] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[6:7] offset:160 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill -; GFX906-NEXT: 
v_lshrrev_b32_e32 v0, 16, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[6:7] offset:144 -; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[6:7] offset:128 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 
offset:332 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30 -; 
GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[6:7] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[6:7] offset:96 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill -; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[6:7] offset:80 -; GFX906-NEXT: 
global_load_dwordx4 v[45:48], v63, s[6:7] offset:64 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill -; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[6:7] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[6:7] offset:32 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51 -; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:600 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54 -; 
GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[6:7] offset:16 -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7] -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 
; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill -; 
GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[6:7] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[6:7] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[6:7] offset:192 +; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[6:7] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[6:7] offset:160 +; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[6:7] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[6:7] offset:128 +; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[6:7] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[6:7] offset:96 +; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[6:7] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[6:7] offset:64 +; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[6:7] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[6:7] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[6:7] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] ; GFX906-NEXT: .LBB6_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62 -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload -; 
GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 -; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62 -; GFX906-NEXT: v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v58, 8, v58 -; GFX906-NEXT: v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: 
v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload -; 
GFX906-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v54, 8, v54 -; GFX906-NEXT: v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload -; GFX906-NEXT: 
v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v50, 8, v50 -; GFX906-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:560 ; 
4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v46, 8, v46 -; GFX906-NEXT: v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
GFX906-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:64 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v43, 
off, s[8:11], 0 offset:520 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v42, 8, v42 -; GFX906-NEXT: v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 
v4, v[0:3], s[2:3] offset:80 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v38, 8, v38 -; GFX906-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:96 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v1, v34, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX906-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:112 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30 -; GFX906-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:128 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload -; GFX906-NEXT: 
s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26 -; GFX906-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v25 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:144 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:280 ; 4-byte 
Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v22, 8, v22 -; GFX906-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:160 -; 
GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112 +; GFX906-NEXT: s_waitcnt vmcnt(7) +; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:96 +; GFX906-NEXT: s_waitcnt vmcnt(7) +; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[0:1] offset:80 +; GFX906-NEXT: s_waitcnt vmcnt(7) +; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[0:1] offset:64 +; GFX906-NEXT: s_waitcnt vmcnt(7) +; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[0:1] offset:48 +; GFX906-NEXT: s_waitcnt vmcnt(7) +; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[0:1] offset:32 +; GFX906-NEXT: s_waitcnt vmcnt(7) +; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:16 +; GFX906-NEXT: s_waitcnt vmcnt(7) +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v19, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v18, 8, v18 -; GFX906-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240 +; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] offset:224 +; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[0:1] offset:208 +; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[0:1] offset:192 +; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[0:1] offset:176 +; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[0:1] offset:160 +; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[0:1] offset:144 +; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[0:1] offset:128 +; GFX906-NEXT: 
s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <256 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <256 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: repeat_successor: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dword s8, s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: s_cmp_lt_i32 s8, 3 +; GFX906-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX906-NEXT: ; %bb.1: ; %LeafBlock +; GFX906-NEXT: s_cmp_gt_i32 s8, 0 +; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX906-NEXT: ; %bb.2: +; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX906-NEXT: global_load_dword v0, v0, s[4:5] +; GFX906-NEXT: s_branch .LBB7_5 +; GFX906-NEXT: .LBB7_3: ; %LeafBlock5 +; GFX906-NEXT: s_cmp_eq_u32 s8, 3 +; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX906-NEXT: ; %bb.4: ; %sw.bb5 +; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX906-NEXT: global_load_dword v0, v0, s[6:7] +; GFX906-NEXT: .LBB7_5: ; %return.sink.split +; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:212 ; 
4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX906-NEXT: global_store_dword v1, v0, s[2:3] +; GFX906-NEXT: .LBB7_6: ; %return +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 + switch i32 %in, label %return [ + i32 1, label %return.sink.split + i32 2, label %return.sink.split + i32 3, label %sw.bb5 + ] + +sw.bb5: + br label %return.sink.split + +return.sink.split: + %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ] + store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void + +return: + ret void +} + +define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { +; GFX906-LABEL: v8i8_phi_chain: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] +; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 +; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX906-NEXT: s_cbranch_execz .LBB8_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3] +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX906-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX906-NEXT: .LBB8_2: ; %Flow +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], 
s[0:1] +; GFX906-NEXT: s_cbranch_execz .LBB8_4 +; GFX906-NEXT: ; %bb.3: ; %bb.2 +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5] +; GFX906-NEXT: .LBB8_4: ; %bb.3 +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:176 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; 
GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v13 -; GFX906-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7] +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + %cmp2 = icmp ult i32 %idx, 7 + br i1 %cmp2, label %bb.2, label %bb.3 + +bb.2: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4 + br label %bb.3 + +bb.3: + %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2] + store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4 + ret void +} + + +define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { +; 
GFX906-LABEL: v8i8_phi_zeroinit: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1] +; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 +; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX906-NEXT: s_cbranch_execz .LBB9_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[2:3] +; GFX906-NEXT: s_mov_b32 s2, 0 +; GFX906-NEXT: s_mov_b32 s3, s2 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_mov_b32_e32 v4, s3 +; GFX906-NEXT: v_mov_b32_e32 v3, s2 +; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX906-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX906-NEXT: .LBB9_2: ; %Flow +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX906-NEXT: s_cbranch_execz .LBB9_4 +; GFX906-NEXT: ; %bb.3: ; %bb.2 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v1, v3 +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: v_mov_b32_e32 v2, v4 +; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5] +; GFX906-NEXT: .LBB9_4: ; %bb.3 +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7] +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr 
addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + %cmp2 = icmp ult i32 %idx, 7 + br i1 %cmp2, label %bb.2, label %bb.3 + +bb.2: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ zeroinitializer, %bb.1 ] + store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4 + br label %bb.3 + +bb.3: + %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2] + store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4 + ret void +} + +define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { +; GFX906-LABEL: v8i8_phi_const: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1] +; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:192 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_cbranch_execz .LBB10_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX906-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX906-NEXT: v_mov_b32_e32 v1, 1 +; GFX906-NEXT: v_mov_b32_e32 v8, 2 +; GFX906-NEXT: v_mov_b32_e32 v6, 3 +; GFX906-NEXT: v_mov_b32_e32 v7, 4 +; GFX906-NEXT: v_mov_b32_e32 v2, 5 +; GFX906-NEXT: v_mov_b32_e32 v5, 6 +; GFX906-NEXT: v_mov_b32_e32 v3, 7 +; GFX906-NEXT: v_mov_b32_e32 v4, 8 +; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX906-NEXT: .LBB10_2: ; %Flow +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX906-NEXT: s_cbranch_execz .LBB10_4 +; GFX906-NEXT: ; %bb.3: ; %bb.2 +; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload -; 
GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4 ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_mov_b32_e32 v9, 0 ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:208 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: global_store_dwordx2 v9, v[0:1], s[4:5] +; GFX906-NEXT: .LBB10_4: ; %bb.3 +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + %cmp2 = icmp ult i32 %idx, 7 + br i1 %cmp2, label %bb.2, label %bb.3 + +bb.2: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [<i8 1, i8 
2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, %bb.1 ] + store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4 + br label %bb.3 + +bb.3: + %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2] + store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4 + ret void +} + +define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { +; GFX906-LABEL: v8i8_multi_block: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX906-NEXT: v_mov_b32_e32 v5, 0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx2 v[3:4], v6, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload -; 
GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_mov_b32_e32 v1, v3 +; GFX906-NEXT: v_mov_b32_e32 v2, v4 +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_cbranch_execz .LBB11_4 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[2:3] +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_cbranch_execz .LBB11_3 +; GFX906-NEXT: ; %bb.2: ; %bb.2 +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5] +; GFX906-NEXT: .LBB11_3: ; %Flow +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: .LBB11_4: ; %bb.3 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded 
Reload -; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[6:7] +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.3 +bb.1: + %cmp2 = icmp ult i32 %idx, 7 + br i1 %cmp2, label %bb.2, label %bb.3 + +bb.2: + store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4 + br label %bb.3 + +bb.3: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2] + store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4 + ret void +} + +define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: v32i8_loop_carried: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; GFX906-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x2000604 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dword v1, v1, s[2:3] +; GFX906-NEXT: s_mov_b64 s[2:3], 0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v0, v1 +; GFX906-NEXT: .LBB12_1: ; %bb.1 +; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc +; GFX906-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_cbranch_execnz .LBB12_1 +; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit +; GFX906-NEXT: 
s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_store_dword v1, v0, s[0:1] +; GFX906-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 + br label %bb.1 + +bb.1: + %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 + br label %bb.2 + +bb.2: + store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4 + ret void +} + +; Should not have instances of "Instruction does not dominate all uses!" + +define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) { +; GFX906-LABEL: v8i8_multiuse_multiblock: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:240 +; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX906-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
GFX906-NEXT: s_cbranch_execz .LBB13_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: s_movk_i32 s6, 0xff00 +; GFX906-NEXT: v_mov_b32_e32 v5, 8 +; GFX906-NEXT: v_and_b32_sdwa v6, v1, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX906-NEXT: s_mov_b32 s6, 0x6070504 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; GFX906-NEXT: v_and_b32_e32 v4, 0xffffff00, v1 +; GFX906-NEXT: v_lshlrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX906-NEXT: v_perm_b32 v7, v1, v1, s6 +; GFX906-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX906-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX906-NEXT: v_mov_b32_e32 v3, 0 +; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v6, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX906-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dword v3, v1, s[8:9] +; GFX906-NEXT: global_store_dword v3, v7, s[8:9] offset:8 +; GFX906-NEXT: global_store_dword v3, v6, s[8:9] offset:16 +; GFX906-NEXT: global_store_dword v3, v4, s[8:9] offset:24 +; GFX906-NEXT: .LBB13_2: ; %Flow +; GFX906-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX906-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX906-NEXT: s_cbranch_execz .LBB13_4 +; GFX906-NEXT: ; %bb.3: ; %bb.2 +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v2 +; GFX906-NEXT: v_and_b32_e32 v4, 0xffffff00, v2 +; GFX906-NEXT: v_and_b32_e32 v5, 0xffffff00, v1 +; GFX906-NEXT: s_mov_b32 s2, 0xc0c0001 +; GFX906-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX906-NEXT: 
v_or_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_perm_b32 v2, 0, v2, s2 +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_perm_b32 v6, 0, v1, s2 +; GFX906-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX906-NEXT: v_and_or_b32 v7, v1, s3, v6 +; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX906-NEXT: global_store_dword v0, v3, s[10:11] +; GFX906-NEXT: global_store_dword v0, v4, s[10:11] offset:8 +; GFX906-NEXT: global_store_dword v0, v7, s[10:11] offset:16 +; GFX906-NEXT: global_store_dword v0, v2, s[10:11] offset:24 +; GFX906-NEXT: .LBB13_4: ; %bb.3 +; GFX906-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX906-NEXT: s_movk_i32 s3, 0xff00 +; GFX906-NEXT: v_mov_b32_e32 v4, 8 +; GFX906-NEXT: s_movk_i32 s2, 0xff +; GFX906-NEXT: v_and_b32_sdwa v2, v1, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX906-NEXT: v_or_b32_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v5, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v6, 8, v1 +; GFX906-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v7, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX906-NEXT: 
v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dword v0, v3, s[0:1] +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; GFX906-NEXT: global_store_dword v0, v4, s[0:1] offset:16 +; GFX906-NEXT: global_store_dword v0, v2, s[0:1] offset:24 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx - %vec1 = load <256 x i8>, ptr addrspace(1) %gep1 + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx - %vec2 = load <256 x i8>, ptr addrspace(1) %gep2 + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 %cmp = icmp ult i32 %idx, 15 br i1 %cmp, label %bb.1, label %bb.2 bb.1: - br label %bb.2 + %s1 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %s2 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 3, i32 2> + %s3 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 1, i32 3> + %s4 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 3, i32 1> + %gep4 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 0 + %gep5 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 1 + %gep6 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 2 + %gep7 = getelementptr ptr addrspace(1), ptr 
addrspace(1) %dst1, i32 3 + store <4 x i8> %s1, ptr addrspace(1) %gep4, align 4 + store <4 x i8> %s2, ptr addrspace(1) %gep5, align 4 + store <4 x i8> %s3, ptr addrspace(1) %gep6, align 4 + store <4 x i8> %s4, ptr addrspace(1) %gep7, align 4 + %cmp2 = icmp ult i32 %idx, 7 + br i1 %cmp2, label %bb.2, label %bb.3 bb.2: - %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] - store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + %s5 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 1, i32 2> + %s6 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 2, i32 1> + %s7 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 2, i32 3> + %s8 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 3, i32 2> + %gep8 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 0 + %gep9 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 1 + %gep10 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 2 + %gep11 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 3 + store <4 x i8> %s5, ptr addrspace(1) %gep8, align 4 + store <4 x i8> %s6, ptr addrspace(1) %gep9, align 4 + store <4 x i8> %s7, ptr addrspace(1) %gep10, align 4 + store <4 x i8> %s8, ptr addrspace(1) %gep11, align 4 + br label %bb.3 + +bb.3: + %s9 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 0, i32 3> + %s10 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 3, i32 0> + %s11 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 0, i32 2> + %s12 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 2, i32 0> + %gep12 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 0 + %gep13 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 1 + %gep14 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 2 + %gep15 = getelementptr ptr addrspace(1), ptr addrspace(1) 
%dst3, i32 3 + store <4 x i8> %s9, ptr addrspace(1) %gep12, align 4 + store <4 x i8> %s10, ptr addrspace(1) %gep13, align 4 + store <4 x i8> %s11, ptr addrspace(1) %gep14, align 4 + store <4 x i8> %s12, ptr addrspace(1) %gep15, align 4 ret void } -declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll new file mode 100644 index 0000000..5d2e299 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll @@ -0,0 +1,352 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -mtriple=amdgcn -mcpu=gfx906 -amdgpu-late-codegenprepare -S -o - %s | FileCheck --check-prefix=GFX906 %s + +define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: define amdgpu_kernel void @v3i8_liveout( +; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: entry: +; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; GFX906-NEXT: [[VEC1:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; GFX906-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[VEC1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; GFX906-NEXT: [[VEC2:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP2]], align 4 +; GFX906-NEXT: [[TMP1:%.*]] = shufflevector <3 x i8> [[VEC2]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[TMP1]] to i32 +; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; GFX906-NEXT: br i1 
[[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; GFX906: bb.1: +; GFX906-NEXT: br label [[BB_2]] +; GFX906: bb.2: +; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; GFX906-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP5_TC]] to i24 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast i24 [[TMP2]] to <3 x i8> +; GFX906-NEXT: store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4 +; GFX906-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <3 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <3 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: define amdgpu_kernel void @v4i8_liveout( +; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: entry: +; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32 +; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; GFX906-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4 +; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32 +; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; GFX906: bb.1: +; 
GFX906-NEXT: br label [[BB_2]] +; GFX906: bb.2: +; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8> +; GFX906-NEXT: store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4 +; GFX906-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: define amdgpu_kernel void @v5i8_liveout( +; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: entry: +; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; GFX906-NEXT: [[VEC1:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; GFX906-NEXT: [[TMP0:%.*]] = shufflevector <5 x i8> [[VEC1]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5> +; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; GFX906-NEXT: [[VEC2:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP2]], align 8 +; GFX906-NEXT: [[TMP1:%.*]] = shufflevector <5 x i8> [[VEC2]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5> +; GFX906-NEXT: 
[[VEC2_BC:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; GFX906: bb.1: +; GFX906-NEXT: br label [[BB_2]] +; GFX906: bb.2: +; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8> +; GFX906-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4> +; GFX906-NEXT: store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4 +; GFX906-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <5 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <5 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: define amdgpu_kernel void @v8i8_liveout( +; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: entry: +; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32> +; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], 
align 8 +; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32> +; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; GFX906: bb.1: +; GFX906-NEXT: br label [[BB_2]] +; GFX906: bb.2: +; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8> +; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4 +; GFX906-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: define amdgpu_kernel void @repeat_successor( +; GFX906-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: entry: +; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32 +; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; GFX906-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4 +; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] 
to i32 +; GFX906-NEXT: switch i32 [[IN]], label [[RETURN:%.*]] [ +; GFX906-NEXT: i32 1, label [[RETURN_SINK_SPLIT:%.*]] +; GFX906-NEXT: i32 2, label [[RETURN_SINK_SPLIT]] +; GFX906-NEXT: i32 3, label [[SW_BB5:%.*]] +; GFX906-NEXT: ] +; GFX906: sw.bb5: +; GFX906-NEXT: br label [[RETURN_SINK_SPLIT]] +; GFX906: return.sink.split: +; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ] +; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8> +; GFX906-NEXT: store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4 +; GFX906-NEXT: ret void +; GFX906: return: +; GFX906-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 + switch i32 %in, label %return [ + i32 1, label %return.sink.split + i32 2, label %return.sink.split + i32 3, label %sw.bb5 + ] + +sw.bb5: + br label %return.sink.split + +return.sink.split: + %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ] + store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + ret void + +return: + ret void +} + +define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { +; GFX906-LABEL: define amdgpu_kernel void @v8i8_phi_chain( +; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: entry: +; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; 
GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32> +; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8 +; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32> +; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; GFX906: bb.1: +; GFX906-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7 +; GFX906-NEXT: br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]] +; GFX906: bb.2: +; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8> +; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST0]], align 4 +; GFX906-NEXT: br label [[BB_3]] +; GFX906: bb.3: +; GFX906-NEXT: [[TMP7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[TMP5_TC]], [[BB_2]] ] +; GFX906-NEXT: [[TMP7_TC_BC:%.*]] = bitcast <2 x i32> [[TMP7_TC]] to <8 x i8> +; GFX906-NEXT: store <8 x i8> [[TMP7_TC_BC]], ptr addrspace(1) [[DST1]], align 4 +; GFX906-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + %cmp2 = icmp ult i32 %idx, 7 + br i1 %cmp2, label %bb.2, label %bb.3 + +bb.2: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4 + br label %bb.3 + +bb.3: + %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2] + store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4 + ret void +} + +define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr 
addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { +; GFX906-LABEL: define amdgpu_kernel void @v8i8_multi_block( +; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: entry: +; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32> +; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8 +; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32> +; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_3:%.*]] +; GFX906: bb.1: +; GFX906-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7 +; GFX906-NEXT: br i1 [[CMP2]], label [[BB_2:%.*]], label [[BB_3]] +; GFX906: bb.2: +; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <2 x i32> [[VEC1_BC]] to <8 x i8> +; GFX906-NEXT: store <8 x i8> [[VEC1_BC_BC]], ptr addrspace(1) [[DST0]], align 4 +; GFX906-NEXT: br label [[BB_3]] +; GFX906: bb.3: +; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ] +; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8> +; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST1]], align 4 +; GFX906-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr 
addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.3 +bb.1: + %cmp2 = icmp ult i32 %idx, 7 + br i1 %cmp2, label %bb.2, label %bb.3 + +bb.2: + store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4 + br label %bb.3 + +bb.3: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2] + store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4 + ret void +} + +define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX906-LABEL: define amdgpu_kernel void @v32i8_loop_carried( +; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: entry: +; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <32 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32 +; GFX906-NEXT: br label [[BB_1:%.*]] +; GFX906: bb.1: +; GFX906-NEXT: [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ] +; GFX906-NEXT: [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8> +; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8> +; GFX906-NEXT: [[VEC2:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> +; GFX906-NEXT: [[VEC2_BC]] = bitcast <4 x i8> [[VEC2]] to i32 +; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; GFX906-NEXT: br i1 [[CMP]], label [[BB_1]], label [[BB_2:%.*]] +; GFX906: 0: +; GFX906-NEXT: br label [[BB_2]] +; GFX906: bb.2: +; GFX906-NEXT: [[VEC2_BC_BC:%.*]] = bitcast i32 [[VEC2_BC]] to <4 x i8> +; GFX906-NEXT: store <4 x i8> [[VEC2_BC_BC]], ptr addrspace(1) [[DST]], align 4 +; GFX906-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = 
getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 + br label %bb.1 + +bb.1: + %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 + br label %bb.2 + +bb.2: + store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() |