Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 104
1 file changed, 93 insertions, 11 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index cb49936..ef58004 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1211,16 +1211,81 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
   llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
 }
 
-static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
-  for (const auto &CI : IA->ParseConstraints()) {
+/// Compute the minimum number of AGPRs required to allocate the inline asm.
+static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
+                                             const CallBase &Call) {
+  unsigned ArgNo = 0;
+  unsigned ResNo = 0;
+  unsigned AGPRDefCount = 0;
+  unsigned AGPRUseCount = 0;
+  unsigned MaxPhysReg = 0;
+  const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();
+
+  // TODO: Overestimates due to not accounting for tied operands
+  for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+    Type *Ty = nullptr;
+    switch (CI.Type) {
+    case InlineAsm::isOutput: {
+      Ty = Call.getType();
+      if (auto *STy = dyn_cast<StructType>(Ty))
+        Ty = STy->getElementType(ResNo);
+      ++ResNo;
+      break;
+    }
+    case InlineAsm::isInput: {
+      Ty = Call.getArgOperand(ArgNo++)->getType();
+      break;
+    }
+    case InlineAsm::isLabel:
+      continue;
+    case InlineAsm::isClobber:
+      // Parse the physical register reference.
+      break;
+    }
+
     for (StringRef Code : CI.Codes) {
-      Code.consume_front("{");
-      if (Code.starts_with("a"))
-        return true;
+      unsigned RegCount = 0;
+      if (Code.starts_with("a")) {
+        // Virtual register, compute number of registers based on the type.
+        //
+        // We ought to be going through TargetLowering to get the number of
+        // registers, but we should avoid the dependence on CodeGen here.
+        RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
+      } else {
+        // Physical register reference
+        auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code);
+        if (Kind == 'a') {
+          RegCount = NumRegs;
+          MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
+        }
+
+        continue;
+      }
+
+      if (CI.Type == InlineAsm::isOutput) {
+        // Apply tuple alignment requirement
+        //
+        // TODO: This is more conservative than necessary.
+        AGPRDefCount = alignTo(AGPRDefCount, RegCount);
+
+        AGPRDefCount += RegCount;
+        if (CI.isEarlyClobber) {
+          AGPRUseCount = alignTo(AGPRUseCount, RegCount);
+          AGPRUseCount += RegCount;
+        }
+      } else {
+        AGPRUseCount = alignTo(AGPRUseCount, RegCount);
+        AGPRUseCount += RegCount;
+      }
     }
   }
 
-  return false;
+  unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
+
+  // TODO: This is overly conservative. If there are any physical registers,
+  // allocate any virtual registers after them so we don't have to solve optimal
+  // packing.
+  return std::min(MaxVirtReg + MaxPhysReg, 256u);
 }
 
 // TODO: Migrate to range merge of amdgpu-agpr-alloc.
@@ -1259,14 +1324,29 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
       const Function *Callee = dyn_cast<Function>(CalleeOp);
       if (!Callee) {
         if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
-          return !inlineAsmUsesAGPRs(IA);
+          return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0;
         return false;
       }
 
-      // Some intrinsics may use AGPRs, but if we have a choice, we are not
-      // required to use AGPRs.
-      if (Callee->isIntrinsic())
+      switch (Callee->getIntrinsicID()) {
+      case Intrinsic::not_intrinsic:
+        break;
+      case Intrinsic::write_register:
+      case Intrinsic::read_register:
+      case Intrinsic::read_volatile_register: {
+        const MDString *RegName = cast<MDString>(
+            cast<MDNode>(
+                cast<MetadataAsValue>(CB.getArgOperand(0))->getMetadata())
+                ->getOperand(0));
+        auto [Kind, RegIdx, NumRegs] =
+            AMDGPU::parseAsmPhysRegName(RegName->getString());
+        return Kind != 'a';
+      }
+      default:
+        // Some intrinsics may use AGPRs, but if we have a choice, we are not
+        // required to use AGPRs.
         return true;
+      }
 
       // TODO: Handle callsite attributes
       const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
@@ -1504,7 +1584,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
-    A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
     CallingConv::ID CC = F->getCallingConv();
     if (!AMDGPU::isEntryFunctionCC(CC)) {
       A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
@@ -1515,6 +1594,9 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     if (!F->isDeclaration() && ST.hasClusters())
       A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));
 
+    if (ST.hasGFX90AInsts())
+      A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
+
     for (auto &I : instructions(F)) {
      Value *Ptr = nullptr;
      if (auto *LI = dyn_cast<LoadInst>(&I))
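For reference, below is a minimal standalone sketch (not part of the patch) of the counting arithmetic that the new inlineAsmGetNumRequiredAGPRs applies to virtual "a" constraints: each operand needs ceil(bit-width / 32) AGPRs, the running def and use counts are aligned to that tuple size before being bumped, and the larger of the two totals is the requirement. The AsmOperand struct, the countVirtualAGPRs helper, and the operand widths are invented for illustration; divideCeil and alignTo are local stand-ins for the LLVM helpers of the same name, and physical-register references, clobbers, labels, and early-clobber outputs are ignored here.

// Simplified model of the virtual-AGPR counting in the patch above. Each
// "a"-constrained operand consumes ceil(BitWidth / 32) registers; the running
// def/use counts are aligned to the tuple size first (mirroring the alignTo
// calls in inlineAsmGetNumRequiredAGPRs), and defs and uses may overlap, so
// the result is the larger of the two totals.
#include <algorithm>
#include <cstdio>
#include <vector>

static unsigned divideCeil(unsigned N, unsigned D) { return (N + D - 1) / D; }
static unsigned alignTo(unsigned V, unsigned A) { return divideCeil(V, A) * A; }

struct AsmOperand {
  bool IsOutput;     // "=a" output constraint vs. "a" input constraint
  unsigned BitWidth; // size in bits of the IR value bound to the constraint
};

static unsigned countVirtualAGPRs(const std::vector<AsmOperand> &Ops) {
  unsigned DefCount = 0, UseCount = 0;
  for (const AsmOperand &Op : Ops) {
    unsigned RegCount = divideCeil(Op.BitWidth, 32); // 32-bit AGPR lanes
    unsigned &Count = Op.IsOutput ? DefCount : UseCount;
    Count = alignTo(Count, RegCount) + RegCount; // align to tuple, then bump
  }
  return std::max(DefCount, UseCount);
}

int main() {
  // "=a"(<4 x i32>), "a"(i32), "a"(<2 x i32>): defs = 4; uses = 1, then
  // aligned to 2 and bumped to 4; max(4, 4) = 4 AGPRs.
  std::vector<AsmOperand> Ops = {{true, 128}, {false, 32}, {false, 64}};
  std::printf("min AGPRs required: %u\n", countVirtualAGPRs(Ops));
  return 0;
}

The real implementation additionally tracks a MaxPhysReg high-water mark for explicit physical AGPR references and clamps the final total to 256.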