diff options
Diffstat (limited to 'llvm/lib')
59 files changed, 1052 insertions, 450 deletions
diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp index 4529123..8974ce5 100644 --- a/llvm/lib/Analysis/HashRecognize.cpp +++ b/llvm/lib/Analysis/HashRecognize.cpp @@ -468,8 +468,11 @@ std::variant<PolynomialInfo, StringRef> HashRecognize::recognizeCRC() const { // Ensure that the PHIs have exactly two uses: // the bit-shift, and the XOR (or a cast feeding into the XOR). + // Also ensure that the SimpleRecurrence's evolution doesn't have stray + // users. if (!ConditionalRecurrence.Phi->hasNUses(2) || - !SimpleRecurrence.Phi->hasNUses(2)) + !SimpleRecurrence.Phi->hasNUses(2) || + SimpleRecurrence.BO->getUniqueUndroppableUser() != SimpleRecurrence.Phi) return "Recurrences have stray uses"; // Check that the SelectInst ConditionalRecurrence.Step is conditional on diff --git a/llvm/lib/Analysis/HeatUtils.cpp b/llvm/lib/Analysis/HeatUtils.cpp index a1cc707..08e9428 100644 --- a/llvm/lib/Analysis/HeatUtils.cpp +++ b/llvm/lib/Analysis/HeatUtils.cpp @@ -64,10 +64,7 @@ std::string llvm::getHeatColor(uint64_t Freq, uint64_t MaxFreq) { } std::string llvm::getHeatColor(double Percent) { - if (Percent > 1.0) - Percent = 1.0; - if (Percent < 0.0) - Percent = 0.0; + Percent = std::clamp(Percent, 0.0, 1.0); unsigned ColorID = unsigned(round(Percent * (HeatSize - 1.0))); return HeatPalette[ColorID]; } diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 5164cec..8e3ce49 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4538,6 +4538,9 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { if (!Indices.empty() && !Ty->isSized(&Visited)) return error(ID.Loc, "base element of getelementptr must be sized"); + if (!ConstantExpr::isSupportedGetElementPtr(Ty)) + return error(ID.Loc, "invalid base element for constant getelementptr"); + if (!GetElementPtrInst::getIndexedType(Ty, Indices)) return error(ID.Loc, "invalid getelementptr indices"); @@ -5639,16 +5642,17 @@ bool LLParser::parseDIBasicType(MDNode *&Result, bool IsDistinct) { OPTIONAL(name, MDStringField, ); \ OPTIONAL(size, MDUnsignedOrMDField, (0, UINT64_MAX)); \ OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ + OPTIONAL(dataSize, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(encoding, DwarfAttEncodingField, ); \ OPTIONAL(num_extra_inhabitants, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(flags, DIFlagField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - Result = GET_OR_DISTINCT(DIBasicType, (Context, tag.Val, name.Val, - size.getValueAsMetadata(Context), - align.Val, encoding.Val, - num_extra_inhabitants.Val, flags.Val)); + Result = GET_OR_DISTINCT( + DIBasicType, + (Context, tag.Val, name.Val, size.getValueAsMetadata(Context), align.Val, + encoding.Val, num_extra_inhabitants.Val, dataSize.Val, flags.Val)); return false; } @@ -6341,8 +6345,8 @@ bool LLParser::parseDIObjCProperty(MDNode *&Result, bool IsDistinct) { #undef VISIT_MD_FIELDS Result = GET_OR_DISTINCT(DIObjCProperty, - (Context, name.Val, file.Val, line.Val, setter.Val, - getter.Val, attributes.Val, type.Val)); + (Context, name.Val, file.Val, line.Val, getter.Val, + setter.Val, attributes.Val, type.Val)); return false; } diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index ed0443f..c63dc8f 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1531,7 +1531,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } case 
bitc::METADATA_BASIC_TYPE: { - if (Record.size() < 6 || Record.size() > 8) + if (Record.size() < 6 || Record.size() > 9) return error("Invalid record"); IsDistinct = Record[0] & 1; @@ -1540,13 +1540,13 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( ? static_cast<DINode::DIFlags>(Record[6]) : DINode::FlagZero; uint32_t NumExtraInhabitants = (Record.size() > 7) ? Record[7] : 0; - + uint32_t DataSizeInBits = (Record.size() > 8) ? Record[8] : 0; Metadata *SizeInBits = getMetadataOrConstant(SizeIsMetadata, Record[3]); - MetadataList.assignValue( GET_OR_DISTINCT(DIBasicType, (Context, Record[1], getMDString(Record[2]), SizeInBits, - Record[4], Record[5], NumExtraInhabitants, Flags)), + Record[4], Record[5], NumExtraInhabitants, + DataSizeInBits, Flags)), NextMetadataNo); NextMetadataNo++; break; @@ -2323,8 +2323,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( GET_OR_DISTINCT(DIObjCProperty, (Context, getMDString(Record[1]), getMDOrNull(Record[2]), Record[3], - getMDString(Record[4]), getMDString(Record[5]), - Record[6], getDITypeRefOrNull(Record[7]))), + /*GetterName=*/getMDString(Record[5]), + /*SetterName=*/getMDString(Record[4]), Record[6], + getDITypeRefOrNull(Record[7]))), NextMetadataNo); NextMetadataNo++; break; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 61aa7c2f5..f17656c 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1925,6 +1925,7 @@ void ModuleBitcodeWriter::writeDIBasicType(const DIBasicType *N, Record.push_back(N->getEncoding()); Record.push_back(N->getFlags()); Record.push_back(N->getNumExtraInhabitants()); + Record.push_back(N->getDataSizeInBits()); Stream.EmitRecord(bitc::METADATA_BASIC_TYPE, Record, Abbrev); Record.clear(); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 518121e..751d373 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1793,9 +1793,13 @@ void DwarfCompileUnit::createBaseTypeDIEs() { "_" + Twine(Btr.BitSize)).toStringRef(Str)); addUInt(Die, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, Btr.Encoding); // Round up to smallest number of bytes that contains this number of bits. + // ExprRefedBaseTypes is populated with types referenced by + // DW_OP_LLVM_convert operations in location expressions. These are often + // byte-sized, but one common counter-example is 1-bit sized conversions + // from `i1` types. TODO: Should these use DW_AT_bit_size? See + // DwarfUnit::constructTypeDIE. 
addUInt(Die, dwarf::DW_AT_byte_size, std::nullopt, divideCeil(Btr.BitSize, 8)); - Btr.Die = &Die; } } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index e40fb76..555c56f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -766,8 +766,19 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIBasicType *BTy) { addUInt(Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, BTy->getEncoding()); - uint64_t Size = BTy->getSizeInBits() >> 3; - addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size); + uint64_t SizeInBytes = divideCeil(BTy->getSizeInBits(), 8); + addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, SizeInBytes); + if (BTy->getTag() == dwarf::Tag::DW_TAG_base_type) { + // DW_TAG_base_type: + // If the value of an object of the given type does not fully occupy the + // storage described by a byte size attribute, the base type entry may also + // have a DW_AT_bit_size [...] attribute. + // TODO: Do big endian targets need DW_AT_data_bit_offset? See discussion in + // pull request #164372. + if (uint64_t DataSizeInBits = BTy->getDataSizeInBits(); + DataSizeInBits && DataSizeInBits != SizeInBytes * 8) + addUInt(Buffer, dwarf::DW_AT_bit_size, std::nullopt, DataSizeInBits); + } if (BTy->isBigEndian()) addUInt(Buffer, dwarf::DW_AT_endianity, std::nullopt, dwarf::DW_END_big); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 1f10478..9ace7d6 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -4425,6 +4425,7 @@ void CombinerHelper::applyBuildFnNoErase( } bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI, + bool AllowScalarConstants, BuildFnTy &MatchInfo) const { assert(MI.getOpcode() == TargetOpcode::G_OR); @@ -4444,31 +4445,29 @@ bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI, // Given constants C0 and C1 such that C0 + C1 is bit-width: // (or (shl x, C0), (lshr y, C1)) -> (fshl x, y, C0) or (fshr x, y, C1) - int64_t CstShlAmt, CstLShrAmt; + int64_t CstShlAmt = 0, CstLShrAmt; if (mi_match(ShlAmt, MRI, m_ICstOrSplat(CstShlAmt)) && mi_match(LShrAmt, MRI, m_ICstOrSplat(CstLShrAmt)) && CstShlAmt + CstLShrAmt == BitWidth) { FshOpc = TargetOpcode::G_FSHR; Amt = LShrAmt; - } else if (mi_match(LShrAmt, MRI, m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) && ShlAmt == Amt) { // (or (shl x, amt), (lshr y, (sub bw, amt))) -> (fshl x, y, amt) FshOpc = TargetOpcode::G_FSHL; - } else if (mi_match(ShlAmt, MRI, m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) && LShrAmt == Amt) { // (or (shl x, (sub bw, amt)), (lshr y, amt)) -> (fshr x, y, amt) FshOpc = TargetOpcode::G_FSHR; - } else { return false; } LLT AmtTy = MRI.getType(Amt); - if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}})) + if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}}) && + (!AllowScalarConstants || CstShlAmt == 0 || !Ty.isScalar())) return false; MatchInfo = [=](MachineIRBuilder &B) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cf221bb..1ef5dc2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23506,6 +23506,93 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... 
> if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) return DAG.getSplat(VT, DL, InVal); + + // Extend this type to be byte-addressable + EVT OldVT = VT; + EVT EltVT = VT.getVectorElementType(); + bool IsByteSized = EltVT.isByteSized(); + if (!IsByteSized) { + EltVT = + EltVT.changeTypeToInteger().getRoundIntegerType(*DAG.getContext()); + VT = VT.changeElementType(EltVT); + } + + // Check if this operation will be handled the default way for its type. + auto IsTypeDefaultHandled = [this](EVT VT) { + return TLI.getTypeAction(*DAG.getContext(), VT) == + TargetLowering::TypeSplitVector || + TLI.isOperationExpand(ISD::INSERT_VECTOR_ELT, VT); + }; + + // Check if this operation is illegal and will be handled the default way, + // even after extending the type to be byte-addressable. + if (IsTypeDefaultHandled(OldVT) && IsTypeDefaultHandled(VT)) { + // For each dynamic insertelt, the default way will save the vector to + // the stack, store at an offset, and load the modified vector. This can + // dramatically increase code size if we have a chain of insertelts on a + // large vector: requiring O(V*C) stores/loads where V = length of + // vector and C is length of chain. If each insertelt is only fed into the + // next, the vector is write-only across this chain, and we can just + // save once before the chain and load after in O(V + C) operations. + SmallVector<SDNode *> Seq{N}; + unsigned NumDynamic = 1; + while (true) { + SDValue InVec = Seq.back()->getOperand(0); + if (InVec.getOpcode() != ISD::INSERT_VECTOR_ELT) + break; + Seq.push_back(InVec.getNode()); + NumDynamic += !isa<ConstantSDNode>(InVec.getOperand(2)); + } + + // It always and only makes sense to lower this sequence when we have more + // than one dynamic insertelt, since we will not have more than V constant + // insertelts, so we will be reducing the total number of stores+loads. + if (NumDynamic > 1) { + // In cases where the vector is illegal it will be broken down into + // parts and stored in parts - we should use the alignment for the + // smallest part. + Align SmallestAlign = DAG.getReducedAlign(VT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(VT.getStoreSize(), SmallestAlign); + auto &MF = DAG.getMachineFunction(); + int FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); + auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex); + + // Save the vector to the stack + SDValue InVec = Seq.back()->getOperand(0); + if (!IsByteSized) + InVec = DAG.getNode(ISD::ANY_EXTEND, DL, VT, InVec); + SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr, + PtrInfo, SmallestAlign); + + // Lower each dynamic insertelt to a store + for (SDNode *N : reverse(Seq)) { + SDValue Elmnt = N->getOperand(1); + SDValue Index = N->getOperand(2); + + // Check if we have to extend the element type + if (!IsByteSized && Elmnt.getValueType().bitsLT(EltVT)) + Elmnt = DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Elmnt); + + // Store the new element. This may be larger than the vector element + // type, so use a truncating store. 
+ SDValue EltPtr = + TLI.getVectorElementPointer(DAG, StackPtr, VT, Index); + EVT EltVT = Elmnt.getValueType(); + Store = DAG.getTruncStore( + Store, DL, Elmnt, EltPtr, MachinePointerInfo::getUnknownStack(MF), + EltVT, + commonAlignment(SmallestAlign, EltVT.getFixedSizeInBits() / 8)); + } + + // Load the saved vector from the stack + SDValue Load = + DAG.getLoad(VT, DL, Store, StackPtr, PtrInfo, SmallestAlign); + SDValue LoadV = Load.getValue(0); + return IsByteSized ? LoadV : DAG.getAnyExtOrTrunc(LoadV, DL, OldVT); + } + } + return SDValue(); } diff --git a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp index 93ff3b9..d87cb4d 100644 --- a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp +++ b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp @@ -552,7 +552,7 @@ llvm::Error GsymCreator::saveSegments(StringRef Path, createSegment(SegmentSize, FuncIdx); if (ExpectedGC) { GsymCreator *GC = ExpectedGC->get(); - if (GC == NULL) + if (!GC) break; // We had not more functions to encode. // Don't collect any messages at all OutputAggregator Out(nullptr); diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp index 1a61d31..e609a7d 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp @@ -55,7 +55,7 @@ struct PerfState { std::unique_ptr<raw_fd_ostream> Dumpstream; // perf mmap marker - void *MarkerAddr = NULL; + void *MarkerAddr = nullptr; }; // prevent concurrent dumps from messing up the output file diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 286ed03..0e5926f 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -5473,7 +5473,8 @@ OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, } // TODO: Enable UndefinedSanitizer to diagnose an overflow here. - CollapsedTripCount = Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount); + CollapsedTripCount = + Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount); } // Create the collapsed loop control flow. 
@@ -9338,9 +9339,8 @@ OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc, // target does not support `atomicrmw` of the size of the struct LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read"); OldVal->setAtomic(AO); - const DataLayout &LoadDL = OldVal->getModule()->getDataLayout(); - unsigned LoadSize = - LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType()); + const DataLayout &DL = OldVal->getModule()->getDataLayout(); + unsigned LoadSize = DL.getTypeStoreSize(XElemTy); OpenMPIRBuilder::AtomicInfo atomicInfo( &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(), OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var); @@ -9384,9 +9384,8 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc, XSt->setAtomic(AO); } else if (XElemTy->isStructTy()) { LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read"); - const DataLayout &LoadDL = OldVal->getModule()->getDataLayout(); - unsigned LoadSize = - LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType()); + const DataLayout &DL = OldVal->getModule()->getDataLayout(); + unsigned LoadSize = DL.getTypeStoreSize(XElemTy); OpenMPIRBuilder::AtomicInfo atomicInfo( &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(), OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var); @@ -9581,7 +9580,7 @@ Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate( OldVal->setAtomic(AO); // CurBB // | /---\ - // ContBB | + // ContBB | // | \---/ // ExitBB BasicBlock *CurBB = Builder.GetInsertBlock(); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 3c222f5..95d954f 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -2199,6 +2199,7 @@ static void writeDIBasicType(raw_ostream &Out, const DIBasicType *N, Printer.printString("name", N->getName()); Printer.printMetadataOrInt("size", N->getRawSizeInBits(), true); Printer.printInt("align", N->getAlignInBits()); + Printer.printInt("dataSize", N->getDataSizeInBits()); Printer.printDwarfEnum("encoding", N->getEncoding(), dwarf::AttributeEncodingString); Printer.printInt("num_extra_inhabitants", N->getNumExtraInhabitants()); diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 07a870f..ca11ecf 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -261,10 +261,12 @@ DIBasicType *DIBuilder::createNullPtrType() { DIBasicType *DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits, unsigned Encoding, DINode::DIFlags Flags, - uint32_t NumExtraInhabitants) { + uint32_t NumExtraInhabitants, + uint32_t DataSizeInBits) { assert(!Name.empty() && "Unable to create type without name"); return DIBasicType::get(VMContext, dwarf::DW_TAG_base_type, Name, SizeInBits, - 0, Encoding, NumExtraInhabitants, Flags); + 0, Encoding, NumExtraInhabitants, DataSizeInBits, + Flags); } DIFixedPointType * diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index e30df88..fafc325 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -872,15 +872,18 @@ DIEnumerator *DIEnumerator::getImpl(LLVMContext &Context, const APInt &Value, DIBasicType *DIBasicType::getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, - uint32_t NumExtraInhabitants, DIFlags Flags, + uint32_t NumExtraInhabitants, + uint32_t DataSizeInBits, DIFlags Flags, StorageType Storage, bool ShouldCreate) { assert(isCanonical(Name) && 
"Expected canonical MDString"); - DEFINE_GETIMPL_LOOKUP(DIBasicType, (Tag, Name, SizeInBits, AlignInBits, - Encoding, NumExtraInhabitants, Flags)); + DEFINE_GETIMPL_LOOKUP(DIBasicType, + (Tag, Name, SizeInBits, AlignInBits, Encoding, + NumExtraInhabitants, DataSizeInBits, Flags)); Metadata *Ops[] = {nullptr, nullptr, Name, SizeInBits, nullptr}; - DEFINE_GETIMPL_STORE(DIBasicType, - (Tag, AlignInBits, Encoding, NumExtraInhabitants, Flags), - Ops); + DEFINE_GETIMPL_STORE( + DIBasicType, + (Tag, AlignInBits, Encoding, NumExtraInhabitants, DataSizeInBits, Flags), + Ops); } std::optional<DIBasicType::Signedness> DIBasicType::getSignedness() const { diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index e03f993..2c9921d 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -480,20 +480,22 @@ template <> struct MDNodeKeyImpl<DIBasicType> { uint32_t AlignInBits; unsigned Encoding; uint32_t NumExtraInhabitants; + uint32_t DataSizeInBits; unsigned Flags; MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, - uint32_t NumExtraInhabitants, unsigned Flags) + uint32_t NumExtraInhabitants, uint32_t DataSizeInBits, + unsigned Flags) : Tag(Tag), Name(Name), SizeInBits(SizeInBits), AlignInBits(AlignInBits), Encoding(Encoding), NumExtraInhabitants(NumExtraInhabitants), - Flags(Flags) {} + DataSizeInBits(DataSizeInBits), Flags(Flags) {} MDNodeKeyImpl(const DIBasicType *N) : Tag(N->getTag()), Name(N->getRawName()), SizeInBits(N->getRawSizeInBits()), AlignInBits(N->getAlignInBits()), Encoding(N->getEncoding()), - NumExtraInhabitants(N->getNumExtraInhabitants()), Flags(N->getFlags()) { - } + NumExtraInhabitants(N->getNumExtraInhabitants()), + DataSizeInBits(N->getDataSizeInBits()), Flags(N->getFlags()) {} bool isKeyOf(const DIBasicType *RHS) const { return Tag == RHS->getTag() && Name == RHS->getRawName() && @@ -501,6 +503,7 @@ template <> struct MDNodeKeyImpl<DIBasicType> { AlignInBits == RHS->getAlignInBits() && Encoding == RHS->getEncoding() && NumExtraInhabitants == RHS->getNumExtraInhabitants() && + DataSizeInBits == RHS->getDataSizeInBits() && Flags == RHS->getFlags(); } diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index a6188f0..1af4a29 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCParser/AsmLexer.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SaveAndRestore.h" diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index 70ac68a..fb6ff62 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -443,7 +443,7 @@ Argument *Context::getOrCreateArgument(llvm::Argument *LLVMArg) { } Constant *Context::getOrCreateConstant(llvm::Constant *LLVMC) { - return cast<Constant>(getOrCreateValueInternal(LLVMC, 0)); + return cast<Constant>(getOrCreateValueInternal(LLVMC, nullptr)); } BasicBlock *Context::createBasicBlock(llvm::BasicBlock *LLVMBB) { diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp index 0b6928e..741bb7b 100644 --- a/llvm/lib/Support/AutoConvert.cpp +++ b/llvm/lib/Support/AutoConvert.cpp @@ -96,7 +96,7 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) { return std::error_code(); } -ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, 
const int FD) { +ErrorOr<__ccsid_t> llvm::getzOSFileTag(const Twine &FileName, const int FD) { // If we have a file descriptor, use it to find out file tagging. Otherwise we // need to use stat() with the file path. if (FD != -1) { @@ -110,12 +110,12 @@ ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) { return Query.fccsid; } struct stat Attr; - if (stat(FileName, &Attr) == -1) + if (stat(FileName.str().c_str(), &Attr) == -1) return std::error_code(errno, std::generic_category()); return Attr.st_tag.ft_ccsid; } -ErrorOr<bool> llvm::needzOSConversion(const char *FileName, const int FD) { +ErrorOr<bool> llvm::needzOSConversion(const Twine &FileName, const int FD) { ErrorOr<__ccsid_t> Ccsid = getzOSFileTag(FileName, FD); if (std::error_code EC = Ccsid.getError()) return EC; diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index 1c4645a..23b9f8c 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -512,7 +512,7 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, } #ifdef __MVS__ - ErrorOr<bool> NeedsConversion = needConversion(Filename.str().c_str(), FD); + ErrorOr<bool> NeedsConversion = needConversion(Filename, FD); if (std::error_code EC = NeedsConversion.getError()) return EC; // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index b3ec65c..2783147 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -366,6 +366,7 @@ def AArch64PostLegalizerCombiner select_to_minmax, or_to_bsp, combine_concat_vector, commute_constant_to_rhs, extract_vec_elt_combines, push_freeze_to_prevent_poison_from_propagating, - combine_mul_cmlt, combine_use_vector_truncate, - extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> { + combine_mul_cmlt, combine_use_vector_truncate, + extmultomull, truncsat_combines, lshr_of_trunc_of_lshr, + funnel_shift_from_or_shift_constants_are_legal]> { } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 457e540..ccc8eb8 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -122,7 +122,7 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { NumBytes = Desc.getSize() ? 
Desc.getSize() : 4; const auto *MFI = MF->getInfo<AArch64FunctionInfo>(); - if (!MFI->shouldSignReturnAddress(MF)) + if (!MFI->shouldSignReturnAddress(*MF)) return NumBytes; const auto &STI = MF->getSubtarget<AArch64Subtarget>(); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index b9e299e..2871a20 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1805,14 +1805,22 @@ def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v8i16>; def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v4i32>; def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v2i64>; -class EOR3_pattern<ValueType VecTy> - : Pat<(xor (xor (VecTy V128:$Vn), (VecTy V128:$Vm)), (VecTy V128:$Va)), - (EOR3 (VecTy V128:$Vn), (VecTy V128:$Vm), (VecTy V128:$Va))>; - -def : EOR3_pattern<v16i8>; -def : EOR3_pattern<v8i16>; -def : EOR3_pattern<v4i32>; -def : EOR3_pattern<v2i64>; +multiclass EOR3_pattern<ValueType Vec128Ty, ValueType Vec64Ty>{ + def : Pat<(xor (xor (Vec128Ty V128:$Vn), (Vec128Ty V128:$Vm)), (Vec128Ty V128:$Va)), + (EOR3 (Vec128Ty V128:$Vn), (Vec128Ty V128:$Vm), (Vec128Ty V128:$Va))>; + def : Pat<(xor (xor (Vec64Ty V64:$Vn), (Vec64Ty V64:$Vm)), (Vec64Ty V64:$Va)), + (EXTRACT_SUBREG + (EOR3 + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vn, dsub), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vm, dsub), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Va, dsub)), + dsub)>; +} + +defm : EOR3_pattern<v16i8, v8i8>; +defm : EOR3_pattern<v8i16, v4i16>; +defm : EOR3_pattern<v4i32, v2i32>; +defm : EOR3_pattern<v2i64, v1i64>; class BCAX_pattern<ValueType VecTy> : Pat<(xor (VecTy V128:$Vn), (and (VecTy V128:$Vm), (vnot (VecTy V128:$Va)))), diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ce2b4a5..cd8b249 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,9 +562,13 @@ public: void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &); +extern char &AMDGPUUniformIntrinsicCombineLegacyPassID; +FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(); + struct AMDGPUUniformIntrinsicCombinePass : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 0eb00cb..529da8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr); struct ImageDimIntrinsicInfo { unsigned Intr; unsigned BaseOpcode; + unsigned AtomicNoRetBaseOpcode; MIMGDim Dim; uint8_t NumOffsetArgs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 97c2c9c..9ce1224 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2006,19 +2006,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); + unsigned IntrOpcode = Intr->BaseOpcode; + + // For image atomic: use no-return opcode if result is unused. 
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) { + Register ResultDef = MI.getOperand(0).getReg(); + if (MRI->use_nodbg_empty(ResultDef)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; + } const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; - Register VDataIn, VDataOut; + Register VDataIn = AMDGPU::NoRegister; + Register VDataOut = AMDGPU::NoRegister; LLT VDataTy; int NumVDataDwords = -1; bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || @@ -2049,7 +2057,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned DMaskLanes = 0; if (BaseOpcode->Atomic) { - VDataOut = MI.getOperand(0).getReg(); + if (!BaseOpcode->NoReturn) + VDataOut = MI.getOperand(0).getReg(); VDataIn = MI.getOperand(2).getReg(); LLT Ty = MRI->getType(VDataIn); @@ -2099,8 +2108,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index a6074ea..bf6f1a9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -30,7 +30,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS @@ -69,6 +68,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes", AMDGPUUnifyDivergentExitNodesPass()) FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass()) FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this))) +FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef FUNCTION_PASS #ifndef FUNCTION_ANALYSIS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6214f4d..75a94ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -619,6 +619,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR); + initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -887,9 +888,6 
@@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); - - if (EnableUniformIntrinsicCombine) - PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( @@ -900,6 +898,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(AMDGPUUseNativeCallsPass()); if (EnableLibCallSimplify) FPM.addPass(AMDGPUSimplifyLibCallsPass()); + + if (EnableUniformIntrinsicCombine) + FPM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerCGSCCOptimizerLateEPCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 50c78d8..65e6ed9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -16,12 +16,6 @@ /// uniformity. And every instruction that's downstream and cares about dynamic /// uniformity must be convergent (and isel will introduce v_readfirstlane for /// them if their operands can't be proven statically uniform). -/// -/// This pass is implemented as a ModulePass because intrinsic declarations -/// exist at the module scope, allowing us to skip processing entirely if no -/// declarations are present and to traverse their user lists directly when -/// they are. A FunctionPass would instead require scanning every instruction -/// in every function to find relevant intrinsics, which is far less efficient. //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -97,14 +91,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, Tracker[NotOp] = true; // NOT preserves uniformity LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); ICmp->replaceAllUsesWith(NotOp); - ICmp->eraseFromParent(); Changed = true; } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { // Case: (icmp ne %ballot, 0) -> %ballot_arg LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " << *Src << '\n'); ICmp->replaceAllUsesWith(Src); - ICmp->eraseFromParent(); Changed = true; } } @@ -120,15 +112,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, return false; } -/// Iterates over intrinsic declarations in the module to optimize their uses. -static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { +/// Iterates over intrinsic calls in the Function to optimize. 
+static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { bool IsChanged = false; ValueMap<const Value *, bool> Tracker; - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - for (Function &F : M) { - switch (F.getIntrinsicID()) { + for (Instruction &I : make_early_inc_range(instructions(F))) { + auto *II = dyn_cast<IntrinsicInst>(&I); + if (!II) + continue; + + switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_permlane64: case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: @@ -137,23 +131,61 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { default: continue; } - - for (User *U : make_early_inc_range(F.users())) { - auto *II = cast<IntrinsicInst>(U); - Function *ParentF = II->getFunction(); - const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF); - IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); - } + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; } PreservedAnalyses -AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { - if (!runUniformIntrinsicCombine(M, AM)) +AMDGPUUniformIntrinsicCombinePass::run(Function &F, + FunctionAnalysisManager &AM) { + const auto &UI = AM.getResult<UniformityInfoAnalysis>(F); + if (!runUniformIntrinsicCombine(F, UI)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve<UniformityInfoAnalysis>(); return PA; } + +namespace { +class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass { +public: + static char ID; + AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) { + initializeAMDGPUUniformIntrinsicCombineLegacyPass( + *PassRegistry::getPassRegistry()); + } + +private: + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); + } +}; +} // namespace + +char AMDGPUUniformIntrinsicCombineLegacy::ID = 0; +char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID = + AMDGPUUniformIntrinsicCombineLegacy::ID; + +bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + const UniformityInfo &UI = + getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); + return runUniformIntrinsicCombine(F, UI); +} + +INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) + +FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() { + return new AMDGPUUniformIntrinsicCombineLegacy(); +} diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 5f6d742..d950131 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -877,69 +877,69 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> { } class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx6789 <op, !if(noRtn, (outs), (outs data_rc:$vdst)), dns> { + let Constraints = 
!if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; } class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx90a <op, !if(noRtn, (outs), (outs getAlign2RegOp<data_rc>.ret:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; } class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX6GFX7", "")> { let AssemblerPredicate = isGFX6GFX7; } class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX8", "")> { let AssemblerPredicate = isGFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx8; } class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX90A", "")> { let AssemblerPredicate = isGFX90APlus; let MIMGEncoding = MIMGEncGfx90a; } class MIMG_Atomic_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX10", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX10", 
"")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -950,24 +950,24 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, class MIMG_Atomic_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -977,11 +977,11 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, } class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC, - int num_addrs, string renamed, bit enableDisasm = 0> - : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs, + int num_addrs, string renamed, bit noRtn = 0, bit enableDisasm = 0> + : VIMAGE_gfx12<op.GFX12, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX12", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, @@ -994,95 +994,96 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, RegisterOperand data_rc, bit enableDasm = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { let hasSideEffects = 1, // FIXME: remove this mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, - FPAtomic = isFP in { + FPAtomic = isFP, IsAtomicNoRet = noRtn in { let VAddrDwords = 1 in { let ssamp = 0 in { if op.HAS_SI then { - def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_VI then { - def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; let hasPostISelHook = 1 in - def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX10M then { - def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>; + 
def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed>; + def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed, noRtn>; } } let VAddrDwords = 2 in { let ssamp = 0 in { if op.HAS_SI then { - def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; + def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, noRtn, 0>; } if op.HAS_VI then { - def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; - def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>; + def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; + def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, noRtn, 0>; } if op.HAS_GFX11 then { - def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>; + def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed>; + def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed, noRtn>; } } let VAddrDwords = 3 in { let ssamp = 0 in { if op.HAS_SI then { - def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; + def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, noRtn, 0>; } if op.HAS_VI then { - def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; - def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>; + def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, noRtn, 0>; } if op.HAS_GFX11 then { - def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>; + def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed>; + def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed, noRtn>; } } let VAddrDwords = 4 in { let ssamp = 0 in { if op.HAS_SI then { - def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; + def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, noRtn, 0>; } if op.HAS_VI then { - def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; - def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>; + def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, 
asm, data_rc, 4, enableDasm>; + def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>; + def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, enableDasm>; + def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, noRtn, enableDasm>; } } } @@ -1095,12 +1096,13 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, } } -multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, - string renamed = ""> { // 64-bit atomics - let IsAtomicRet = 1 in { +multiclass MIMG_Atomic_Base <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { // 64-bit atomics + let IsAtomicRet = !not(noRtn) in { def "" : MIMGBaseOpcode { let Atomic = 1; let AtomicX2 = isCmpSwap; + let NoReturn = noRtn; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { @@ -1109,22 +1111,28 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, // Other variants are reconstructed by disassembler using dmask and tfe. if !not(isCmpSwap) then { let VDataDwords = 1 in - defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>; + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, noRtn, renamed>; } let VDataDwords = 2 in - defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>; + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, noRtn, renamed>; let VDataDwords = 3 in - defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>; + defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, noRtn, renamed>; if isCmpSwap then { let VDataDwords = 4 in - defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>; + defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, noRtn, renamed>; let VDataDwords = 5 in - defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>; + defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, noRtn, renamed>; } } - } // End IsAtomicRet = 1 + } +} + +multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + string renamed = ""> { + defm "" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/0, renamed>; + defm "_NORTN" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/1, renamed>; } multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed, @@ -1820,6 +1828,7 @@ let SubtargetPredicate = isGFX12Plus in { class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { Intrinsic Intr = I; MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod)); + MIMGBaseOpcode AtomicNoRetBaseOpcode = BaseOpcode; AMDGPUDimProps Dim = I.P.Dim; AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>; @@ -1855,13 +1864,20 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0)); } +class ImageDimAtomicIntrinsicInfo<AMDGPUImageDimIntrinsic I> + : ImageDimIntrinsicInfo<I> { + MIMGBaseOpcode AtomicNoRetBaseOpcode = + 
!cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod, "_NORTN")); +} + def ImageDimIntrinsicTable : GenericTable { let FilterClass = "ImageDimIntrinsicInfo"; - let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", - "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", - "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", + let Fields = ["Intr", "BaseOpcode", "AtomicNoRetBaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", + "NumVAddrs", "NumArgs", "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", + "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", "BiasTyArg", "GradientTyArg", "CoordTyArg"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; + string TypeOf_AtomicNoRetBaseOpcode = "MIMGBaseOpcode"; string TypeOf_Dim = "MIMGDim"; let PrimaryKey = ["Intr"]; @@ -1874,11 +1890,14 @@ def getImageDimIntrinsicByBaseOpcode : SearchIndex { let Key = ["BaseOpcode", "Dim"]; } -foreach intr = !listconcat(AMDGPUImageDimIntrinsics, - AMDGPUImageDimAtomicIntrinsics) in { +foreach intr = AMDGPUImageDimIntrinsics in { def : ImageDimIntrinsicInfo<intr>; } +foreach intr = AMDGPUImageDimAtomicIntrinsics in { + def : ImageDimAtomicIntrinsicInfo<intr>; +} + // L to LZ Optimization Mapping def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>; def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index be42291..b34ab2a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9134,16 +9134,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDLoc DL(Op); MachineFunction &MF = DAG.getMachineFunction(); const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>(); + unsigned IntrOpcode = Intr->BaseOpcode; + // For image atomic: use no-return opcode if result is unused. 
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode && + !Op.getNode()->hasAnyUseOfValue(0)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); SmallVector<EVT, 3> ResultTypes(Op->values()); SmallVector<EVT, 3> OrigResultTypes(Op->values()); + if (BaseOpcode->NoReturn && BaseOpcode->Atomic) + ResultTypes.erase(&ResultTypes[0]); + bool IsD16 = false; bool IsG16 = false; bool IsA16 = false; @@ -9162,8 +9169,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, VData = Op.getOperand(2); IsAtomicPacked16Bit = - (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || - Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN); bool Is64Bit = VData.getValueSizeInBits() == 64; if (BaseOpcode->AtomicX2) { @@ -9173,7 +9182,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (Is64Bit) VData = DAG.getBitcast(MVT::v4i32, VData); - ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + if (!BaseOpcode->NoReturn) + ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + DMask = Is64Bit ? 0xf : 0x3; NumVDataDwords = Is64Bit ? 4 : 2; } else { @@ -9399,8 +9410,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return Op; @@ -9512,13 +9524,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); } + if (BaseOpcode->NoReturn) { + if (BaseOpcode->Atomic) + return DAG.getMergeValues( + {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL); + + return SDValue(NewNode, 0); + } + if (BaseOpcode->AtomicX2) { SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); } - if (BaseOpcode->NoReturn) - return SDValue(NewNode, 0); + return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, NumVDataDwords, IsAtomicPacked16Bit, DL); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index d80a6f3..a6c1af2 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1823,6 +1823,16 @@ void SIRegisterInfo::buildSpillLoadStore( } } + Register FinalValueReg = ValueReg; + if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) { + // If we are loading 16-bit value with SRAMECC endabled we need a temp + // 32-bit VGPR to load and extract 16-bits into the final register. 
+ ValueReg = + RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0); + SubReg = ValueReg; + IsKill = false; + } + MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); MachineMemOperand *NewMMO = MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, @@ -1863,6 +1873,17 @@ void SIRegisterInfo::buildSpillLoadStore( MIB.addImm(0); // swz MIB.addMemOperand(NewMMO); + if (FinalValueReg != ValueReg) { + // Extract 16-bit from the loaded 32-bit value. + ValueReg = getSubReg(ValueReg, AMDGPU::lo16); + MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64)) + .addReg(FinalValueReg, getDefRegState(true)) + .addImm(0) + .addReg(ValueReg, getKillRegState(true)) + .addImm(0); + ValueReg = FinalValueReg; + } + if (!IsAGPR && NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); @@ -2505,7 +2526,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc; if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) { assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); - Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16; + Opc = ST.d16PreservesUnusedBits() + ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16 + : AMDGPU::SCRATCH_LOAD_USHORT_SADDR; } else { Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index fdba454..a4d3d62 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -601,10 +601,20 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); - if (!Subtarget->hasVFP2Base()) + if (!Subtarget->hasVFP2Base()) { setAllExpand(MVT::f32); - if (!Subtarget->hasFP64()) + } else { + for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, + ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) + setOperationAction(Op, MVT::f32, Legal); + } + if (!Subtarget->hasFP64()) { setAllExpand(MVT::f64); + } else { + for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, + ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) + setOperationAction(Op, MVT::f64, Legal); + } } if (Subtarget->hasFullFP16()) { @@ -1281,12 +1291,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, LibCall); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, LibCall); } // fp16 is a special v7 extension that adds f16 <-> f32 conversions. if (!Subtarget->hasFP16()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, LibCall); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, LibCall); } // Strict floating-point comparisons need custom lowering. 
@@ -1333,31 +1347,42 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, } // FP16 often need to be promoted to call lib functions + // clang-format off if (Subtarget->hasFullFP16()) { - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FTAN, MVT::f16, Promote); - setOperationAction(ISD::FSINCOS, MVT::f16, Promote); - setOperationAction(ISD::FPOWI, MVT::f16, Promote); - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FEXP, MVT::f16, Promote); - setOperationAction(ISD::FEXP2, MVT::f16, Promote); - setOperationAction(ISD::FEXP10, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::f16, Promote); - setOperationAction(ISD::FLOG10, MVT::f16, Promote); - setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::LRINT, MVT::f16, Expand); setOperationAction(ISD::LROUND, MVT::f16, Expand); - - setOperationAction(ISD::FROUND, MVT::f16, Legal); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); - setOperationAction(ISD::FTRUNC, MVT::f16, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); - setOperationAction(ISD::FRINT, MVT::f16, Legal); - setOperationAction(ISD::FFLOOR, MVT::f16, Legal); - setOperationAction(ISD::FCEIL, MVT::f16, Legal); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); + + for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, + ISD::FCOS, ISD::FSIN, ISD::FSINCOS, + ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS, + ISD::FASIN, ISD::FATAN, ISD::FATAN2, + ISD::FCOSH, ISD::FSINH, ISD::FTANH, + ISD::FTAN, ISD::FEXP, ISD::FEXP2, + ISD::FEXP10, ISD::FLOG, ISD::FLOG2, + ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW, + ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN, + ISD::STRICT_FACOS, ISD::STRICT_FASIN, ISD::STRICT_FATAN, + ISD::STRICT_FATAN2, ISD::STRICT_FCOSH, ISD::STRICT_FSINH, + ISD::STRICT_FTANH, ISD::STRICT_FEXP, ISD::STRICT_FEXP2, + ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, + ISD::STRICT_FTAN}) { + setOperationAction(Op, MVT::f16, Promote); + } + + // Round-to-integer need custom lowering for fp16, as Promote doesn't work + // because the result type is integer. 
+ for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) + setOperationAction(Op, MVT::f16, Custom); + + for (auto Op : {ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC, + ISD::FNEARBYINT, ISD::FRINT, ISD::FFLOOR, + ISD::FCEIL, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN, + ISD::STRICT_FTRUNC, ISD::STRICT_FNEARBYINT, ISD::STRICT_FRINT, + ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL}) { + setOperationAction(Op, MVT::f16, Legal); + } + // clang-format on } if (Subtarget->hasNEON()) { @@ -10725,6 +10750,19 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerCMP(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: { + assert((Op.getOperand(1).getValueType() == MVT::f16 || + Op.getOperand(1).getValueType() == MVT::bf16) && + "Expected custom lowering of rounding operations only for f16"); + SDLoc DL(Op); + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } } } diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 10d4cd5..f7176a6 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -473,15 +473,15 @@ def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs)>; // An 'fmul' node with a single use. let HasOneUse = 1 in -def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs)>; +def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (any_fmul node:$lhs, node:$rhs)>; // An 'fadd' node which checks for single non-hazardous use. -def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{ +def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fadd node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; // An 'fsub' node which checks for single non-hazardous use. -def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ +def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fsub node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index 6771106..e2cc97b 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -439,14 +439,14 @@ let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FP def VADDD : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fadd DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VADDS : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fadd SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPALU32]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -457,21 +457,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VADDH : AHbI<0b11100, 0b11, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBD : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fsub DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBS : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fsub SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPALU32]>{ // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. @@ -482,42 +482,42 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VSUBH : AHbI<0b11100, 0b11, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVD : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fdiv DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPDIV64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVS : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fdiv SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPDIV32]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVH : AHbI<0b11101, 0b00, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPDIV32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULD : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fmul DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULS : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fmul SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -528,21 +528,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VMULH : AHbI<0b11100, 0b10, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULD : ADbI<0b11100, 0b10, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>, + [(set DPR:$Dd, (fneg (any_fmul DPR:$Dn, (f64 DPR:$Dm))))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULS : ASbI<0b11100, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]>, + [(set SPR:$Sd, (fneg (any_fmul SPR:$Sn, SPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. @@ -553,7 +553,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VNMULH : AHbI<0b11100, 0b10, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fneg (fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>, + [(set (f16 HPR:$Sd), (fneg (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; multiclass vsel_inst<string op, bits<2> opc, int CC> { @@ -587,7 +587,7 @@ defm VSELGE : vsel_inst<"ge", 0b10, 10>; defm VSELEQ : vsel_inst<"eq", 0b00, 0>; defm VSELVS : vsel_inst<"vs", 0b01, 6>; -multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { +multiclass vmaxmin_inst<string op, bit opc, PatFrags SD> { let DecoderNamespace = "VFPV8", PostEncoderMethod = "", isUnpredicable = 1, mayRaiseFPException = 1 in { def H : AHbInp<0b11101, 0b00, opc, @@ -610,8 +610,8 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { } } -defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>; -defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>; +defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, any_fmaxnum>; +defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, any_fminnum>; // Match reassociated forms only if not sign dependent rounding. def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), @@ -746,7 +746,7 @@ let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", "", - [(set DPR:$Dd, (fpextend SPR:$Sm))]>, + [(set DPR:$Dd, (any_fpextend SPR:$Sm))]>, Sched<[WriteFPCVT]> { // Instruction operands. bits<5> Dd; @@ -766,7 +766,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", "", - [(set SPR:$Sd, (fpround DPR:$Dm))]>, + [(set SPR:$Sd, (any_fpround DPR:$Dm))]>, Sched<[WriteFPCVT]> { // Instruction operands. 
bits<5> Sd; @@ -796,7 +796,7 @@ def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))), +def : FP16Pat<(f32 (any_fpextend (f16 HPR:$Sm))), (VCVTBHS (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>; def : FP16Pat<(f16_to_fp GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; @@ -808,16 +808,16 @@ def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f16 (fpround SPR:$Sm)), +def : FP16Pat<(f16 (any_fpround SPR:$Sm)), (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$Sm), HPR)>; def : FP16Pat<(fp_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$a), GPR))>; -def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane), (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTBSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; -def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane), (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTBSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), @@ -830,9 +830,9 @@ def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f32 (fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))), +def : FP16Pat<(f32 (any_fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))), (VCVTTHS (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane)))>; -def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), +def : FP16Pat<(f32 (any_fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), (VCVTTHS (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)), (SSubReg_f16_reg imm_odd:$lane)))>; @@ -844,12 +844,12 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane), (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; -def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane), (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTTSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), @@ -872,7 +872,7 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, let hasSideEffects = 0; } -def : FullFP16Pat<(f64 (fpextend (f16 HPR:$Sm))), +def : FullFP16Pat<(f64 (any_fpextend (f16 HPR:$Sm))), (VCVTBHD (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>, Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(f64 (f16_to_fp GPR:$a)), @@ -898,7 +898,7 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, let hasSideEffects = 0; } -def : FullFP16Pat<(f16 (fpround DPR:$Dm)), +def : FullFP16Pat<(f16 (any_fpround DPR:$Dm)), (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$Dm), HPR)>, Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(fp_to_f16 
(f64 DPR:$a)), @@ -1007,41 +1007,41 @@ multiclass vcvt_inst<string opc, bits<2> rm, let Predicates = [HasFPARMv8] in { let Predicates = [HasFullFP16] in { - def : Pat<(i32 (fp_to_sint (node (f16 HPR:$a)))), + def : Pat<(i32 (any_fp_to_sint (node (f16 HPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"SH") (f16 HPR:$a)), GPR)>; - def : Pat<(i32 (fp_to_uint (node (f16 HPR:$a)))), + def : Pat<(i32 (any_fp_to_uint (node (f16 HPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"UH") (f16 HPR:$a)), GPR)>; } - def : Pat<(i32 (fp_to_sint (node SPR:$a))), + def : Pat<(i32 (any_fp_to_sint (node SPR:$a))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"SS") SPR:$a), GPR)>; - def : Pat<(i32 (fp_to_uint (node SPR:$a))), + def : Pat<(i32 (any_fp_to_uint (node SPR:$a))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"US") SPR:$a), GPR)>; } let Predicates = [HasFPARMv8, HasDPVFP] in { - def : Pat<(i32 (fp_to_sint (node (f64 DPR:$a)))), + def : Pat<(i32 (any_fp_to_sint (node (f64 DPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"SD") DPR:$a), GPR)>; - def : Pat<(i32 (fp_to_uint (node (f64 DPR:$a)))), + def : Pat<(i32 (any_fp_to_uint (node (f64 DPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"UD") DPR:$a), GPR)>; } } -defm VCVTA : vcvt_inst<"a", 0b00, fround>; +defm VCVTA : vcvt_inst<"a", 0b00, any_fround>; defm VCVTN : vcvt_inst<"n", 0b01>; -defm VCVTP : vcvt_inst<"p", 0b10, fceil>; -defm VCVTM : vcvt_inst<"m", 0b11, ffloor>; +defm VCVTP : vcvt_inst<"p", 0b10, any_fceil>; +defm VCVTM : vcvt_inst<"m", 0b11, any_ffloor>; def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, (outs DPR:$Dd), (ins DPR:$Dm), @@ -1103,9 +1103,9 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node, Requires<[HasFPARMv8,HasDPVFP]>; } -defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc, [], 0>; -defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint, [FPSCR_RM], 0>; -defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint, [FPSCR_RM], 1>; +defm VRINTZ : vrint_inst_zrx<"z", 0, 1, any_ftrunc, [], 0>; +defm VRINTR : vrint_inst_zrx<"r", 0, 0, any_fnearbyint, [FPSCR_RM], 0>; +defm VRINTX : vrint_inst_zrx<"x", 1, 0, any_frint, [FPSCR_RM], 1>; multiclass vrint_inst_anpm<string opc, bits<2> rm, SDPatternOperator node = null_frag> { @@ -1145,30 +1145,31 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm, Requires<[HasFPARMv8,HasDPVFP]>; } -defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>; -defm VRINTN : vrint_inst_anpm<"n", 0b01, froundeven>; -defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>; -defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>; +defm VRINTA : vrint_inst_anpm<"a", 0b00, any_fround>; +defm VRINTN : vrint_inst_anpm<"n", 0b01, any_froundeven>; +defm VRINTP : vrint_inst_anpm<"p", 0b10, any_fceil>; +defm VRINTM : vrint_inst_anpm<"m", 0b11, any_ffloor>; + let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs DPR:$Dd), (ins DPR:$Dm), IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", "", - [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fsqrt (f64 DPR:$Dm)))]>, Sched<[WriteFPSQRT64]>; let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", "", - [(set SPR:$Sd, (fsqrt SPR:$Sm))]>, + [(set SPR:$Sd, (any_fsqrt SPR:$Sm))]>, Sched<[WriteFPSQRT32]>; let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm", - [(set (f16 HPR:$Sd), (fsqrt 
(f16 HPR:$Sm)))]>; + [(set (f16 HPR:$Sd), (any_fsqrt (f16 HPR:$Sm)))]>; let hasSideEffects = 0 in { let isMoveReg = 1 in { @@ -1509,10 +1510,10 @@ def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(f64 (sint_to_fp GPR:$a)), + def : VFPPat<(f64 (any_sint_to_fp GPR:$a)), (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), + def : VFPPat<(f64 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOD (VLDRS addrmode5:$a))>; } @@ -1529,10 +1530,10 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f32 (any_sint_to_fp GPR:$a)), (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOS (VLDRS addrmode5:$a))>; let mayRaiseFPException = 1 in @@ -1545,7 +1546,7 @@ def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f16 (any_sint_to_fp GPR:$a)), (VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; let mayRaiseFPException = 1 in @@ -1558,10 +1559,10 @@ def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(f64 (uint_to_fp GPR:$a)), + def : VFPPat<(f64 (any_uint_to_fp GPR:$a)), (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), + def : VFPPat<(f64 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOD (VLDRS addrmode5:$a))>; } @@ -1578,10 +1579,10 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f32 (any_uint_to_fp GPR:$a)), (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOS (VLDRS addrmode5:$a))>; let mayRaiseFPException = 1 in @@ -1594,7 +1595,7 @@ def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f16 (any_uint_to_fp GPR:$a)), (VUITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; // FP -> Int: @@ -1669,12 +1670,12 @@ def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))), + def : VFPPat<(i32 (any_fp_to_sint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat (f64 DPR:$a), i32)), (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; - def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (any_fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f64 DPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; @@ -1693,12 +1694,12 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)), +def : VFPNoNEONPat<(i32 (any_fp_to_sint SPR:$a)), (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat SPR:$a, i32)), (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; -def : 
VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))), +def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_sint (f32 SPR:$a))), addrmode5:$ptr), (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f32 SPR:$a), i32)), @@ -1715,7 +1716,7 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))), +def : VFPNoNEONPat<(i32 (any_fp_to_sint (f16 HPR:$a))), (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>; @@ -1730,12 +1731,12 @@ def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))), + def : VFPPat<(i32 (any_fp_to_uint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat (f64 DPR:$a), i32)), (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; - def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (any_fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f64 DPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; @@ -1754,12 +1755,12 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)), +def : VFPNoNEONPat<(i32 (any_fp_to_uint SPR:$a)), (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat SPR:$a, i32)), (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; -def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))), +def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_uint (f32 SPR:$a))), addrmode5:$ptr), (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f32 SPR:$a), i32)), @@ -1776,7 +1777,7 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(i32 (fp_to_uint (f16 HPR:$a))), +def : VFPNoNEONPat<(i32 (any_fp_to_uint (f16 HPR:$a))), (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>; @@ -2320,13 +2321,13 @@ def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), // Match @llvm.fma.* intrinsics // (fma x, y, z) -> (vfms z, x, y) -def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), +def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), +def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))), +def : Pat<(f16 (any_fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))), (VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2375,13 +2376,13 @@ def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), // Match @llvm.fma.* intrinsics // (fma (fneg x), y, z) -> (vfms z, x, y) -def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), +def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), +def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), (VFMSS SPR:$Sdin, SPR:$Sn, 
SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))), +def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))), (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2427,23 +2428,23 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), // Match @llvm.fma.* intrinsics // (fneg (fma x, y, z)) -> (vfnma z, x, y) -def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), +def : Pat<(fneg (any_fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), +def : Pat<(fneg (any_fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))), +def : Pat<(fneg (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))), (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y) -def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), +def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), +def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), +def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2488,23 +2489,23 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), // Match @llvm.fma.* intrinsics // (fma x, y, (fneg z)) -> (vfnms z, x, y)) -def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), +def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), +def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), +def : Pat<(f16 (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y) -def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), +def : Pat<(fneg (f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), +def : Pat<(fneg (f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))), +def : Pat<(fneg (f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))), (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 44c4830..7ae500a 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1058,6 +1058,16 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { IntrinArgIndex<0>, 
IntrinArgI8<WaveOpKind_Max>, IntrinArgI8<SignedOpKind_Unsigned> ]>, + IntrinSelect<int_dx_wave_reduce_min, + [ + IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>, + IntrinArgI8<SignedOpKind_Signed> + ]>, + IntrinSelect<int_dx_wave_reduce_umin, + [ + IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>, + IntrinArgI8<SignedOpKind_Unsigned> + ]>, ]; let arguments = [OverloadTy, Int8Ty, Int8Ty]; diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index e7e7f2c..ce6e812 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -94,6 +94,8 @@ static bool checkWaveOps(Intrinsic::ID IID) { case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_wave_reduce_max: case Intrinsic::dx_wave_reduce_umax: + case Intrinsic::dx_wave_reduce_min: + case Intrinsic::dx_wave_reduce_umin: return true; } } diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 1e4797b..cf8b833 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -36,9 +36,10 @@ using namespace llvm; using namespace llvm::dxil; namespace { -/// A simple Wrapper DiagnosticInfo that generates Module-level diagnostic -/// for TranslateMetadata pass -class DiagnosticInfoTranslateMD : public DiagnosticInfo { + +/// A simple wrapper of DiagnosticInfo that generates module-level diagnostic +/// for the DXILValidateMetadata pass +class DiagnosticInfoValidateMD : public DiagnosticInfo { private: const Twine &Msg; const Module &Mod; @@ -47,9 +48,9 @@ public: /// \p M is the module for which the diagnostic is being emitted. \p Msg is /// the message to show. Note that this class does not copy this message, so /// this reference must be valid for the whole life time of the diagnostic. - DiagnosticInfoTranslateMD(const Module &M, - const Twine &Msg LLVM_LIFETIME_BOUND, - DiagnosticSeverity Severity = DS_Error) + DiagnosticInfoValidateMD(const Module &M, + const Twine &Msg LLVM_LIFETIME_BOUND, + DiagnosticSeverity Severity = DS_Error) : DiagnosticInfo(DK_Unsupported, Severity), Msg(Msg), Mod(M) {} void print(DiagnosticPrinter &DP) const override { @@ -57,6 +58,16 @@ public: } }; +static void reportError(Module &M, Twine Message, + DiagnosticSeverity Severity = DS_Error) { + M.getContext().diagnose(DiagnosticInfoValidateMD(M, Message, Severity)); +} + +static void reportLoopError(Module &M, Twine Message, + DiagnosticSeverity Severity = DS_Error) { + reportError(M, Twine("Invalid \"llvm.loop\" metadata: ") + Message, Severity); +} + enum class EntryPropsTag { ShaderFlags = 0, GSState, @@ -314,25 +325,122 @@ static void translateBranchMetadata(Module &M, Instruction *BBTerminatorInst) { BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); } -static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) { +// Determines if the metadata node will be compatible with DXIL's loop metadata +// representation. +// +// Reports an error for compatible metadata that is ill-formed. 
+static bool isLoopMDCompatible(Module &M, Metadata *MD) { + // DXIL only accepts the following loop hints: + std::array<StringLiteral, 3> ValidHintNames = {"llvm.loop.unroll.count", + "llvm.loop.unroll.disable", + "llvm.loop.unroll.full"}; + + MDNode *HintMD = dyn_cast<MDNode>(MD); + if (!HintMD || HintMD->getNumOperands() == 0) + return false; + + auto *HintStr = dyn_cast<MDString>(HintMD->getOperand(0)); + if (!HintStr) + return false; + + if (!llvm::is_contained(ValidHintNames, HintStr->getString())) + return false; + + auto ValidCountNode = [](MDNode *CountMD) -> bool { + if (CountMD->getNumOperands() == 2) + if (auto *Count = dyn_cast<ConstantAsMetadata>(CountMD->getOperand(1))) + if (isa<ConstantInt>(Count->getValue())) + return true; + return false; + }; + + if (HintStr->getString() == "llvm.loop.unroll.count") { + if (!ValidCountNode(HintMD)) { + reportLoopError(M, "\"llvm.loop.unroll.count\" must have 2 operands and " + "the second must be a constant integer"); + return false; + } + } else if (HintMD->getNumOperands() != 1) { + reportLoopError( + M, "\"llvm.loop.unroll.disable\" and \"llvm.loop.unroll.full\" " + "must be provided as a single operand"); + return false; + } + + return true; +} + +static void translateLoopMetadata(Module &M, Instruction *I, MDNode *BaseMD) { + // A distinct node has the self-referential form: !0 = !{ !0, ... } + auto IsDistinctNode = [](MDNode *Node) -> bool { + return Node && Node->getNumOperands() != 0 && Node == Node->getOperand(0); + }; + + // Set metadata to null to remove empty/ill-formed metadata from instruction + if (BaseMD->getNumOperands() == 0 || !IsDistinctNode(BaseMD)) + return I->setMetadata("llvm.loop", nullptr); + + // It is valid to have a chain of self-referential loop metadata nodes, as + // below. We will collapse these into just one when we reconstruct the + // metadata. + // + // E.g.: + // !0 = !{!0, !1} + // !1 = !{!1, !2} + // !2 = !{!"llvm.loop.unroll.disable"} + // + // So, traverse down a potential self-referential chain + while (1 < BaseMD->getNumOperands() && + IsDistinctNode(dyn_cast<MDNode>(BaseMD->getOperand(1)))) + BaseMD = dyn_cast<MDNode>(BaseMD->getOperand(1)); + + // To reconstruct a distinct node we create a temporary node that we will + // then update to create a self-reference. + llvm::TempMDTuple TempNode = llvm::MDNode::getTemporary(M.getContext(), {}); + SmallVector<Metadata *> CompatibleOperands = {TempNode.get()}; + + // Iterate and reconstruct the metadata nodes that contain any hints, + // stripping any unrecognized metadata.
+ ArrayRef<MDOperand> Operands = BaseMD->operands(); + for (auto &Op : Operands.drop_front()) + if (isLoopMDCompatible(M, Op.get())) + CompatibleOperands.push_back(Op.get()); + + if (2 < CompatibleOperands.size()) + reportLoopError(M, "Provided conflicting hints"); + + MDNode *CompatibleLoopMD = MDNode::get(M.getContext(), CompatibleOperands); + TempNode->replaceAllUsesWith(CompatibleLoopMD); + + I->setMetadata("llvm.loop", CompatibleLoopMD); +} + +using InstructionMDList = std::array<unsigned, 7>; + +static InstructionMDList getCompatibleInstructionMDs(llvm::Module &M) { return { M.getMDKindID("dx.nonuniform"), M.getMDKindID("dx.controlflow.hints"), M.getMDKindID("dx.precise"), llvm::LLVMContext::MD_range, - llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias}; + llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias, + M.getMDKindID("llvm.loop")}; } static void translateInstructionMetadata(Module &M) { // construct allowlist of valid metadata node kinds - std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M); + InstructionMDList DXILCompatibleMDs = getCompatibleInstructionMDs(M); + unsigned char MDLoopKind = M.getContext().getMDKindID("llvm.loop"); for (Function &F : M) { for (BasicBlock &BB : F) { // This needs to be done first so that "hlsl.controlflow.hints" isn't - // removed in the whitelist below + // removed in the allow-list below if (auto *I = BB.getTerminator()) translateBranchMetadata(M, I); for (auto &I : make_early_inc_range(BB)) { + if (isa<BranchInst>(I)) + if (MDNode *LoopMD = I.getMetadata(MDLoopKind)) + translateLoopMetadata(M, &I, LoopMD); I.dropUnknownNonDebugMetadata(DXILCompatibleMDs); } } @@ -364,6 +472,16 @@ static void cleanModuleFlags(Module &M) { M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val); } +using GlobalMDList = std::array<StringLiteral, 7>; + +// The following are compatible with DXIL but not emitted by clang; they can +// be added when applicable: +// dx.typeAnnotations, dx.viewIDState, dx.dxrPayloadAnnotations +static GlobalMDList CompatibleNamedModuleMDs = { + "llvm.ident", "llvm.module.flags", "dx.resources", "dx.valver", + "dx.shaderModel", "dx.version", "dx.entryPoints", +}; + static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, DXILResourceTypeMap &DRTM, const ModuleShaderFlags &ShaderFlags, @@ -389,31 +507,23 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, uint64_t CombinedMask = ShaderFlags.getCombinedFlags(); EntryFnMDNodes.emplace_back( emitTopLevelLibraryNode(M, ResourceMD, CombinedMask)); - } else if (MMDI.EntryPropertyVec.size() > 1) { - M.getContext().diagnose(DiagnosticInfoTranslateMD( - M, "Non-library shader: One and only one entry expected")); - } + } else if (1 < MMDI.EntryPropertyVec.size()) + reportError(M, "Non-library shader: One and only one entry expected"); for (const EntryProperties &EntryProp : MMDI.EntryPropertyVec) { - const ComputedShaderFlags &EntrySFMask = - ShaderFlags.getFunctionFlags(EntryProp.Entry); - - // If ShaderProfile is Library, mask is already consolidated in the - // top-level library node. Hence it is not emitted.
uint64_t EntryShaderFlags = 0; if (MMDI.ShaderProfile != Triple::EnvironmentType::Library) { - EntryShaderFlags = EntrySFMask; - if (EntryProp.ShaderStage != MMDI.ShaderProfile) { - M.getContext().diagnose(DiagnosticInfoTranslateMD( - M, - "Shader stage '" + - Twine(getShortShaderStage(EntryProp.ShaderStage) + - "' for entry '" + Twine(EntryProp.Entry->getName()) + - "' different from specified target profile '" + - Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) + - "'")))); - } + EntryShaderFlags = ShaderFlags.getFunctionFlags(EntryProp.Entry); + if (EntryProp.ShaderStage != MMDI.ShaderProfile) + reportError( + M, "Shader stage '" + + Twine(getShortShaderStage(EntryProp.ShaderStage)) + + "' for entry '" + Twine(EntryProp.Entry->getName()) + + "' different from specified target profile '" + + Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) + + "'")); } + EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD, EntryShaderFlags, MMDI.ShaderProfile)); @@ -426,19 +536,17 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, cleanModuleFlags(M); - // dx.rootsignatures will have been parsed from its metadata form as its - // binary form as part of the RootSignatureAnalysisWrapper, so safely - // remove it as it is not recognized in DXIL - if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) - RootSignature->eraseFromParent(); + // Finally, strip all module metadata that is not explicitly specified in the + // allow-list + SmallVector<NamedMDNode *> ToStrip; - // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and - // causes all tests using the DXIL Validator to fail. - // - // This is a temporary fix and should be replaced with a allowlist once - // we have determined all metadata that the DXIL Validator allows - if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) - ErrNo->eraseFromParent(); + for (NamedMDNode &NamedMD : M.named_metadata()) + if (!NamedMD.getName().starts_with("llvm.dbg.") && + !llvm::is_contained(CompatibleNamedModuleMDs, NamedMD.getName())) + ToStrip.push_back(&NamedMD); + + for (NamedMDNode *NamedMD : ToStrip) + NamedMD->eraseFromParent(); } PreservedAnalyses DXILTranslateMetadata::run(Module &M, @@ -454,45 +562,34 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M, return PreservedAnalyses::all(); } -namespace { -class DXILTranslateMetadataLegacy : public ModulePass { -public: - static char ID; // Pass identification, replacement for typeid - explicit DXILTranslateMetadataLegacy() : ModulePass(ID) {} - - StringRef getPassName() const override { return "DXIL Translate Metadata"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DXILResourceTypeWrapperPass>(); - AU.addRequired<DXILResourceWrapperPass>(); - AU.addRequired<ShaderFlagsAnalysisWrapper>(); - AU.addRequired<DXILMetadataAnalysisWrapperPass>(); - AU.addRequired<RootSignatureAnalysisWrapper>(); - - AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); - AU.addPreserved<DXILResourceBindingWrapperPass>(); - AU.addPreserved<DXILResourceWrapperPass>(); - AU.addPreserved<RootSignatureAnalysisWrapper>(); - AU.addPreserved<ShaderFlagsAnalysisWrapper>(); - } +void DXILTranslateMetadataLegacy::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DXILResourceTypeWrapperPass>(); + AU.addRequired<DXILResourceWrapperPass>(); + AU.addRequired<ShaderFlagsAnalysisWrapper>(); + AU.addRequired<DXILMetadataAnalysisWrapperPass>(); + AU.addRequired<RootSignatureAnalysisWrapper>(); + + 
AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); + AU.addPreserved<DXILResourceBindingWrapperPass>(); + AU.addPreserved<DXILResourceWrapperPass>(); + AU.addPreserved<RootSignatureAnalysisWrapper>(); + AU.addPreserved<ShaderFlagsAnalysisWrapper>(); +} - bool runOnModule(Module &M) override { - DXILResourceMap &DRM = - getAnalysis<DXILResourceWrapperPass>().getResourceMap(); - DXILResourceTypeMap &DRTM = - getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap(); - const ModuleShaderFlags &ShaderFlags = - getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags(); - dxil::ModuleMetadataInfo MMDI = - getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); - - translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); - translateInstructionMetadata(M); - return true; - } -}; +bool DXILTranslateMetadataLegacy::runOnModule(Module &M) { + DXILResourceMap &DRM = + getAnalysis<DXILResourceWrapperPass>().getResourceMap(); + DXILResourceTypeMap &DRTM = + getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap(); + const ModuleShaderFlags &ShaderFlags = + getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags(); + dxil::ModuleMetadataInfo MMDI = + getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); -} // namespace + translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); + translateInstructionMetadata(M); + return true; +} char DXILTranslateMetadataLegacy::ID = 0; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h index 4c1ffac..cfb8aaa 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h @@ -10,6 +10,7 @@ #define LLVM_TARGET_DIRECTX_DXILTRANSLATEMETADATA_H #include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" namespace llvm { @@ -20,6 +21,22 @@ public: PreservedAnalyses run(Module &M, ModuleAnalysisManager &); }; +/// Wrapper pass for the legacy pass manager. +/// +/// This is required because the passes that will depend on this are codegen +/// passes which run through the legacy pass manager. 
+class DXILTranslateMetadataLegacy : public ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + explicit DXILTranslateMetadataLegacy() : ModulePass(ID) {} + + StringRef getPassName() const override { return "DXIL Translate Metadata"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + bool runOnModule(Module &M) override; +}; + } // namespace llvm #endif // LLVM_TARGET_DIRECTX_DXILTRANSLATEMETADATA_H diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 68fd3e0..60dfd96 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -55,8 +55,10 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_splitdouble: case Intrinsic::dx_wave_readlane: case Intrinsic::dx_wave_reduce_max: + case Intrinsic::dx_wave_reduce_min: case Intrinsic::dx_wave_reduce_sum: case Intrinsic::dx_wave_reduce_umax: + case Intrinsic::dx_wave_reduce_umin: case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_imad: case Intrinsic::dx_umad: diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 4029e14..729c077 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -493,7 +493,7 @@ NVPTXTTIImpl::getInstructionCost(const User *U, // predicate ("@"). return !AsmInst.empty() && (AsmInst[0] == '@' || isAlpha(AsmInst[0]) || - AsmInst.find(".pragma") != StringRef::npos); + AsmInst.contains(".pragma")); }); return InstCount * TargetTransformInfo::TCC_Basic; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 41a9c92..96e8afc 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -823,6 +823,7 @@ static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) { break; case RISCV::fixup_riscv_rvc_jump: case RISCV::fixup_riscv_rvc_branch: + case RISCV::fixup_riscv_rvc_imm: case RISCV::fixup_riscv_jal: return false; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 6d587e6..5934c91 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -688,6 +688,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, // the `jal` again in the assembler. 
} else if (MIFrm == RISCVII::InstFormatCI) { FixupKind = RISCV::fixup_riscv_rvc_imm; + AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcili); } else if (MIFrm == RISCVII::InstFormatI) { FixupKind = RISCV::fixup_riscv_12_i; } else if (MIFrm == RISCVII::InstFormatQC_EB) { diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 98b636e..9bd66a4 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -373,6 +373,26 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, .addReg(ScratchReg) .addImm(-1); break; + case AtomicRMWInst::Max: + BuildMI(LoopMBB, DL, TII->get(RISCV::MAX), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Min: + BuildMI(LoopMBB, DL, TII->get(RISCV::MIN), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::UMax: + BuildMI(LoopMBB, DL, TII->get(RISCV::MAXU), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::UMin: + BuildMI(LoopMBB, DL, TII->get(RISCV::MINU), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; } BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) .addReg(ScratchReg) @@ -682,6 +702,9 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width, MachineBasicBlock::iterator &NextMBBI) { + // Using MIN(U)/MAX(U) is preferable if permitted + if (STI->hasPermissiveZalrsc() && STI->hasStdExtZbb() && !IsMasked) + return expandAtomicBinOp(MBB, MBBI, BinOp, IsMasked, Width, NextMBBI); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 2754d78..b4556f6 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1906,6 +1906,25 @@ def FeatureForcedAtomics : SubtargetFeature< def HasAtomicLdSt : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">; +// The RISC-V Unprivileged Architecture - ISA Volume 1 (Version: 20250508) +// [https://docs.riscv.org/reference/isa/_attachments/riscv-unprivileged.pdf] +// in section 13.3. Eventual Success of Store-Conditional Instructions, defines +// _constrained_ LR/SC loops: +// The dynamic code executed between the LR and SC instructions can only +// contain instructions from the base ''I'' instruction set, excluding loads, +// stores, backward jumps, taken backward branches, JALR, FENCE, and SYSTEM +// instructions. Compressed forms of the aforementioned ''I'' instructions in +// the Zca and Zcb extensions are also permitted. +// LR/SC loops that do not adhere to the above are _unconstrained_ LR/SC loops, +// and success is implementation specific. For implementations which know that +// non-base instructions (such as the ''B'' extension) will not violate any +// forward progress guarantees, using these instructions to reduce the LR/SC +// sequence length is desirable.
+def FeaturePermissiveZalrsc + : SubtargetFeature< + "permissive-zalrsc", "HasPermissiveZalrsc", "true", + "Implementation permits non-base instructions between LR/SC pairs">; + def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "AllowTaggedGlobals", "true", "Use an instruction sequence for taking the address of a global " diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 6181abb..47022b3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -745,7 +745,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( .addDef(ResVReg) .addUse(getSPIRVTypeID(BaseType)) .addImm(static_cast<uint32_t>(Storage)); - if (Init != 0) + if (Init) MIB.addUse(Init->getOperand(0).getReg()); // ISel may introduce a new register on this step, so we need to add it to // DT and correct its type avoiding fails on the next stage. diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 021353a..3fea21e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -222,6 +222,9 @@ private: bool selectWaveReduceMax(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, bool IsUnsigned) const; + bool selectWaveReduceMin(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, bool IsUnsigned) const; + bool selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; @@ -2456,6 +2459,35 @@ bool SPIRVInstructionSelector::selectWaveReduceMax(Register ResVReg, .constrainAllUses(TII, TRI, RBI); } +bool SPIRVInstructionSelector::selectWaveReduceMin(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I, + bool IsUnsigned) const { + assert(I.getNumOperands() == 3); + assert(I.getOperand(2).isReg()); + MachineBasicBlock &BB = *I.getParent(); + Register InputRegister = I.getOperand(2).getReg(); + SPIRVType *InputType = GR.getSPIRVTypeForVReg(InputRegister); + + if (!InputType) + report_fatal_error("Input Type could not be determined."); + + SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII); + // Retrieve the operation to use based on the input type + bool IsFloatTy = GR.isScalarOrVectorOfType(InputRegister, SPIRV::OpTypeFloat); + auto IntegerOpcodeType = + IsUnsigned ? SPIRV::OpGroupNonUniformUMin : SPIRV::OpGroupNonUniformSMin; + auto Opcode = IsFloatTy ?
SPIRV::OpGroupNonUniformFMin : IntegerOpcodeType; + return BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(GR.getOrCreateConstInt(SPIRV::Scope::Subgroup, I, IntTy, TII, + !STI.isShader())) + .addImm(SPIRV::GroupOperation::Reduce) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); +} + bool SPIRVInstructionSelector::selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { @@ -3431,6 +3463,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ true); case Intrinsic::spv_wave_reduce_max: return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ false); + case Intrinsic::spv_wave_reduce_umin: + return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ true); + case Intrinsic::spv_wave_reduce_min: + return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ false); case Intrinsic::spv_wave_reduce_sum: return selectWaveReduceSum(ResVReg, ResType, I); case Intrinsic::spv_wave_readlane: diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 3da720f..58109ac 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -8973,8 +8973,7 @@ SystemZTargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc, if (const auto *CB = dyn_cast<CallBase>(RHSVal)) { if (CB->isInlineAsm()) { const InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand()); - return IA && - IA->getConstraintString().find("{@cc}") != std::string::npos; + return IA && IA->getConstraintString().contains("{@cc}"); } } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 27f7e1a..5a1779c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -81,7 +81,7 @@ WebAssemblyFrameLowering::getLocalForStackObject(MachineFunction &MF, // Abuse object size to record number of WebAssembly locals allocated to // this object. MFI.setObjectSize(FrameIndex, ValueVTs.size()); - return static_cast<unsigned>(Local); + return Local; } /// We need a base pointer in the case of having items on the stack that diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b86020a..624cff2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12213,7 +12213,7 @@ static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) : MVT::getVectorVT(ShiftSVT, Size / Scale); - return (int)ShiftAmt; + return ShiftAmt; }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just @@ -48787,6 +48787,11 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, } if (CC == X86::COND_E || CC == X86::COND_NE) { + // Canonicalize constant to RHS if we're just using ZF. + if (Op0 != Op1 && DAG.isConstantIntBuildVectorOrConstantInt(Op0) && + !DAG.isConstantIntBuildVectorOrConstantInt(Op1)) + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op0); + // TESTZ(X,~Y) == TESTC(Y,X) if (SDValue NotOp1 = IsNOT(Op1, DAG)) { CC = (CC == X86::COND_E ? 
X86::COND_B : X86::COND_AE); @@ -48850,10 +48855,6 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, } } - // TESTZ(-1,X) == TESTZ(X,X) - if (ISD::isBuildVectorAllOnes(Op0.getNode())) - return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1); - // TESTZ(X,-1) == TESTZ(X,X) if (ISD::isBuildVectorAllOnes(Op1.getNode())) return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); @@ -54679,7 +54680,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewPtr = DAG.getMemBasePlusOffset( Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); SDValue NewLoad = - DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getMemOperand()); + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + Align(), Ld->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), NewLoad.getValue(1)); return NewLoad; diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index c8d1938..0849fc7 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1179,7 +1179,7 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, const unsigned *Features, unsigned *Type, unsigned *Subtype) { - const char *CPU = 0; + const char *CPU = nullptr; switch (Family) { case 4: diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index 042578d..6a11aec 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -380,7 +380,7 @@ bool ExpandVariadics::runOnModule(Module &M) { if (CB->isIndirectCall()) { FunctionType *FTy = CB->getFunctionType(); if (FTy->isVarArg()) - Changed |= expandCall(M, Builder, CB, FTy, 0); + Changed |= expandCall(M, Builder, CB, FTy, /*NF=*/nullptr); } } } diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 5ba2167..cc53ec2 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -1957,8 +1957,12 @@ Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Value *DataFlowSanitizer::getShadowAddress(Value *Addr, BasicBlock::iterator Pos) { IRBuilder<> IRB(Pos->getParent(), Pos); - Value *ShadowOffset = getShadowOffset(Addr, IRB); - return getShadowAddress(Addr, Pos, ShadowOffset); + Value *ShadowAddr = getShadowOffset(Addr, IRB); + uint64_t ShadowBase = MapParams->ShadowBase; + if (ShadowBase != 0) + ShadowAddr = + IRB.CreateAdd(ShadowAddr, ConstantInt::get(IntptrTy, ShadowBase)); + return getShadowAddress(Addr, Pos, ShadowAddr); } Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2, diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index a6ec6c1..2f256df 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -216,7 +216,6 @@ static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar, } LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to " << Reason << ".\n"); - return; } struct AllocMatchInfo { diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 66e45ec..e84ca81 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -122,16 +122,22 @@ static cl::opt<unsigned> 
cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50)); -extern cl::opt<bool> ProfcheckDisableMetadataFixes; - -} // namespace llvm - static cl::opt<double> MaxClonedRate( "dfa-max-cloned-rate", cl::desc( "Maximum cloned instructions rate accepted for the transformation"), cl::Hidden, cl::init(7.5)); +static cl::opt<unsigned> + MaxOuterUseBlocks("dfa-max-out-use-blocks", + cl::desc("Maximum unduplicated blocks with outer uses " + "accepted for the transformation"), + cl::Hidden, cl::init(40)); + +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + +} // namespace llvm + namespace { class SelectInstToUnfold { SelectInst *SI; @@ -965,8 +971,16 @@ private: // SLPVectorizer. // TODO: Thread the switch partially before reaching the threshold. uint64_t NumOrigInst = 0; - for (auto *BB : DuplicateMap.keys()) + uint64_t NumOuterUseBlock = 0; + for (auto *BB : DuplicateMap.keys()) { NumOrigInst += BB->sizeWithoutDebug(); + // Only unduplicated blocks with a single predecessor require new phi + // nodes. + for (auto *Succ : successors(BB)) + if (!DuplicateMap.count(Succ) && Succ->getSinglePredecessor()) + NumOuterUseBlock++; + } + if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) { LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too many " "instructions will be cloned\n"); @@ -977,6 +991,20 @@ private: return false; } + // Too many unduplicated blocks with outer uses may cause too many + // insertions of phi nodes for duplicated definitions. TODO: Drop this + // threshold if we come up with another way to reduce the number of inserted + // phi nodes. + if (NumOuterUseBlock > MaxOuterUseBlocks) { + LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too many " + "blocks with outer uses\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch) + << "Too many blocks with outer uses."; + }); + return false; + } + InstructionCost DuplicationCost = 0; unsigned JumpTableSize = 0; diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index a9ab3b3..27fed73 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -809,7 +809,6 @@ public: void emitInstructionAnnot(const Instruction *I, formatted_raw_ostream &OS) override { if (const auto *PI = PredInfo->getPredicateInfoFor(I)) { - OS << "; Has predicate info\n"; if (const auto *PB = dyn_cast<PredicateBranch>(PI)) { OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge << " Comparison:" << *PB->Condition << " Edge: ["; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index c537be5c..b03fb62 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1866,10 +1866,19 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI, // If either of the blocks has its address taken, then we can't do this fold, // because the code we'd hoist would no longer run when we jump into the block // by its address. - for (auto *Succ : successors(BB)) - if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor()) + for (auto *Succ : successors(BB)) { + if (Succ->hasAddressTaken()) return false; - + if (Succ->getSinglePredecessor()) + continue; + // If Succ has more than one predecessor, continue to check if Succ contains only + // one `unreachable` inst. 
Since executing an `unreachable` inst is UB, we + // can relax the condition based on the assumption that the program would + // never enter Succ and trigger such UB. + if (isa<UnreachableInst>(*Succ->begin())) + continue; + return false; + } // The second of pair is a SkipFlags bitmask. using SuccIterPair = std::pair<BasicBlock::iterator, unsigned>; SmallVector<SuccIterPair, 8> SuccIterPairs; @@ -5228,32 +5237,52 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr"); } - // Create the new switch instruction now. - SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size()); - if (HasProfile) { - // We know the weight of the default case. We don't know the weight of the - // other cases, but rather than completely lose profiling info, we split - // the remaining probability equally over them. - SmallVector<uint32_t> NewWeights(Values.size() + 1); - NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped if - // TrueWhenEqual. - for (auto &V : drop_begin(NewWeights)) - V = BranchWeights[0] / Values.size(); - setBranchWeights(*New, NewWeights, /*IsExpected=*/false); - } - - // Add all of the 'cases' to the switch instruction. - for (ConstantInt *Val : Values) - New->addCase(Val, EdgeBB); + // Check if we can represent the values as a contiguous range. If so, we use a + // range check + conditional branch instead of a switch. + if (Values.front()->getValue() - Values.back()->getValue() == + Values.size() - 1) { + ConstantRange RangeToCheck = ConstantRange::getNonEmpty( + Values.back()->getValue(), Values.front()->getValue() + 1); + APInt Offset, RHS; + ICmpInst::Predicate Pred; + RangeToCheck.getEquivalentICmp(Pred, RHS, Offset); + Value *X = CompVal; + if (!Offset.isZero()) + X = Builder.CreateAdd(X, ConstantInt::get(CompVal->getType(), Offset)); + Value *Cond = + Builder.CreateICmp(Pred, X, ConstantInt::get(CompVal->getType(), RHS)); + BranchInst *NewBI = Builder.CreateCondBr(Cond, EdgeBB, DefaultBB); + if (HasProfile) + setBranchWeights(*NewBI, BranchWeights, /*IsExpected=*/false); + // We don't need to update PHI nodes since we don't add any new edges. + } else { + // Create the new switch instruction now. + SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size()); + if (HasProfile) { + // We know the weight of the default case. We don't know the weight of the + // other cases, but rather than completely lose profiling info, we split + // the remaining probability equally over them. + SmallVector<uint32_t> NewWeights(Values.size() + 1); + NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped + // if TrueWhenEqual. + for (auto &V : drop_begin(NewWeights)) + V = BranchWeights[0] / Values.size(); + setBranchWeights(*New, NewWeights, /*IsExpected=*/false); + } - // We added edges from PI to the EdgeBB. As such, if there were any - // PHI nodes in EdgeBB, they need entries to be added corresponding to - // the number of edges added. - for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) { - PHINode *PN = cast<PHINode>(BBI); - Value *InVal = PN->getIncomingValueForBlock(BB); - for (unsigned i = 0, e = Values.size() - 1; i != e; ++i) - PN->addIncoming(InVal, BB); + // Add all of the 'cases' to the switch instruction. + for (ConstantInt *Val : Values) + New->addCase(Val, EdgeBB); + + // We added edges from PI to the EdgeBB. 
As such, if there were any + // PHI nodes in EdgeBB, they need entries to be added corresponding to + // the number of edges added. + for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) { + PHINode *PN = cast<PHINode>(BBI); + Value *InVal = PN->getIncomingValueForBlock(BB); + for (unsigned i = 0, e = Values.size() - 1; i != e; ++i) + PN->addIncoming(InVal, BB); + } } // Erase the old branch instruction. @@ -7603,7 +7632,9 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, auto *DefaultCaseBB = SI->getDefaultDest(); BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU); auto It = OrigBB->getTerminator()->getIterator(); - BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It); + auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It); + // BI is handling the default case for SI, and so should share its DebugLoc. + BI->setDebugLoc(SI->getDebugLoc()); It->eraseFromParent(); addPredecessorToBlock(DefaultCaseBB, OrigBB, SplitBB); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4fcaf6d..1b55a3b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5608,6 +5608,7 @@ private: for (ScheduleBundle *Bundle : Bundles) { if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0) break; + SmallPtrSet<Value *, 4> ParentsUniqueUsers; // Need to search for the lane since the tree entry can be // reordered. auto *It = find(Bundle->getTreeEntry()->Scalars, In); @@ -5636,6 +5637,22 @@ private: Bundle->getTreeEntry()->isCopyableElement(In)) && "Missed TreeEntry operands?"); + bool IsNonSchedulableWithParentPhiNode = + Bundle->getTreeEntry()->doesNotNeedToSchedule() && + Bundle->getTreeEntry()->UserTreeIndex && + Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() && + Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() == + Instruction::PHI; + // Count the number of unique phi nodes, which are the parent for + // parent entry, and exit, if all the unique phis are processed. + if (IsNonSchedulableWithParentPhiNode) { + const TreeEntry *ParentTE = + Bundle->getTreeEntry()->UserTreeIndex.UserTE; + Value *User = ParentTE->Scalars[Lane]; + if (!ParentsUniqueUsers.insert(User).second) + break; + } + for (unsigned OpIdx : seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) if (auto *I = dyn_cast<Instruction>( @@ -5644,8 +5661,8 @@ private: << *I << "\n"); DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked); } - // If parent node is schedulable, it will be handle correctly. - if (!Bundle->getTreeEntry()->doesNotNeedToSchedule()) + // If parent node is schedulable, it will be handled correctly. + if (!IsNonSchedulableWithParentPhiNode) break; It = std::find(std::next(It), Bundle->getTreeEntry()->Scalars.end(), In); @@ -16903,7 +16920,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // otherwise TEPtr depends on TE. if ((TEInsertBlock != InsertPt->getParent() || TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) && - !CheckOrdering(InsertPt)) + (!CheckOrdering(InsertPt) || + (UseEI.UserTE->hasCopyableElements() && + isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) && + is_contained(UseEI.UserTE->Scalars, TEInsertPt)))) continue; // The node is reused - exit. 
if (CheckAndUseSameNode(TEPtr)) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index acad795..4d98014 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3648,6 +3648,37 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Sub = VecOp->getDefiningRecipe(); VecOp = Tmp; } + + // If ValB is a constant and can be safely extended, truncate it to the same + // type as ExtA's operand, then extend it to the same type as ExtA. This + // creates two uniform extends that can more easily be matched by the rest of + // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all + // replaced with the new extend of the constant. + auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA, + VPWidenCastRecipe *&ExtB, + VPValue *&ValB, VPWidenRecipe *Mul) { + if (!ExtA || ExtB || !ValB->isLiveIn()) + return; + Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0)); + Instruction::CastOps ExtOpc = ExtA->getOpcode(); + const APInt *Const; + if (!match(ValB, m_APInt(Const)) || + !llvm::canConstantBeExtended( + Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc))) + return; + // The truncate ensures that the type of each extended operand is the + // same, and it's been proven that the constant can be extended from + // NarrowTy safely. Necessary since ExtA's extended operand would be + // e.g. an i8, while the const will likely be an i32. This will be + // elided by later optimisations. + VPBuilder Builder(Mul); + auto *Trunc = + Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy); + Type *WideTy = Ctx.Types.inferScalarType(ExtA); + ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy); + Mul->setOperand(1, ExtB); + }; + // Try to match reduce.add(mul(...)). if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { auto *RecipeA = @@ -3656,6 +3687,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe()); + // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const))) + ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul); + // Match reduce.add/sub(mul(ext, ext)). if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) && match(RecipeB, m_ZExtOrSExt(m_VPValue())) && @@ -3665,7 +3699,6 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, cast<VPWidenRecipe>(Sub), Red); return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red); } - // Match reduce.add(mul). // TODO: Add an expression type for this variant with a negated mul if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr)) return new VPExpressionRecipe(Mul, Red); @@ -3674,18 +3707,26 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // variants. if (Sub) return nullptr; - // Match reduce.add(ext(mul(ext(A), ext(B)))). - // All extend recipes must have same opcode or A == B - // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))). - if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()), - m_ZExtOrSExt(m_VPValue()))))) { + + // Match reduce.add(ext(mul(A, B))). 
+ if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) { auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe()); auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe()); - auto *Ext0 = - cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe()); - auto *Ext1 = - cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe()); - if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && + auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe()); + auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); + + // reduce.add(ext(mul(ext, const))) + // -> reduce.add(ext(mul(ext, ext(const)))) + ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul); + + // reduce.add(ext(mul(ext(A), ext(B)))) + // -> reduce.add(mul(wider_ext(A), wider_ext(B))) + // The inner extends must either have the same opcode as the outer extend or + // be the same, in which case the multiply can never result in a negative + // value and the outer extend can be folded away by doing wider + // extends for the operands of the mul. + if (Ext0 && Ext1 && + (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && Ext0->getOpcode() == Ext1->getOpcode() && IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) { auto *NewExt0 = new VPWidenCastRecipe( @@ -4021,7 +4062,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, DenseMap<const SCEV *, Value *> VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { const DataLayout &DL = SE.getDataLayout(); - SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true); + SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false); auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry()); BasicBlock *EntryBB = Entry->getIRBasicBlock(); |
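The SimplifyCFG hunk above (simplifyBranchOnICmpChain) replaces a switch over a contiguous set of compared-against constants with a single range check plus conditional branch. As a minimal standalone sketch of why that rewrite is sound (plain C++, not the LLVM builder API; matchesChain and matchesRangeCheck are invented names for illustration), the naive equality chain and the folded unsigned range test agree on every input:

#include <cassert>
#include <cstdint>

// Naive form: test X against every constant in the contiguous set [Lo, Hi],
// the shape the branch-on-icmp chain starts from.
static bool matchesChain(uint32_t X, uint32_t Lo, uint32_t Hi) {
  for (uint32_t V = Lo;; ++V) {
    if (X == V)
      return true;
    if (V == Hi)
      return false;
  }
}

// Folded form: offset by the low end and do one unsigned comparison, the same
// shape as the icmp + conditional branch emitted instead of a switch.
static bool matchesRangeCheck(uint32_t X, uint32_t Lo, uint32_t Hi) {
  return X - Lo <= Hi - Lo; // unsigned wrap makes any X < Lo fail the check
}

int main() {
  const uint32_t Lo = 10, Hi = 13; // e.g. the constants {10, 11, 12, 13}
  for (uint32_t X = 0; X <= 30; ++X)
    assert(matchesChain(X, Lo, Hi) == matchesRangeCheck(X, Lo, Hi));
  return 0;
}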

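Similarly, the VPlanTransforms hunk above only rewrites reduce.add(mul(ext, const)) into reduce.add(mul(ext, ext(const))) after llvm::canConstantBeExtended confirms that truncating the constant to the narrow operand type and re-extending it reproduces the original value. A hedged sketch of that round-trip condition, assuming an i32 constant and an i8 narrow type (canRoundTrip is an invented stand-in, not an LLVM API):

#include <cassert>
#include <cstdint>

enum class ExtKind { Sign, Zero };

// Returns true if WideConst survives a trunc to i8 followed by the given
// extension back to i32, i.e. widening the constant would not change the
// multiply's result.
static bool canRoundTrip(int32_t WideConst, ExtKind Kind) {
  int8_t Narrow = static_cast<int8_t>(WideConst); // trunc i32 -> i8
  int32_t ReExtended =
      (Kind == ExtKind::Sign)
          ? static_cast<int32_t>(Narrow)                        // sext back
          : static_cast<int32_t>(static_cast<uint8_t>(Narrow)); // zext back
  return ReExtended == WideConst;
}

int main() {
  assert(canRoundTrip(100, ExtKind::Sign));  // fits in a signed i8
  assert(canRoundTrip(-5, ExtKind::Sign));   // negative values sign-extend back
  assert(!canRoundTrip(200, ExtKind::Sign)); // 200 truncates to -56 under sext
  assert(canRoundTrip(200, ExtKind::Zero));  // fits in an unsigned i8
  assert(!canRoundTrip(-5, ExtKind::Zero));  // zext cannot reproduce a negative
  return 0;
}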