diff options
Diffstat (limited to 'clang/lib')
39 files changed, 622 insertions, 1293 deletions
diff --git a/clang/lib/AST/CommentSema.cpp b/clang/lib/AST/CommentSema.cpp index 27ff5ab..d5ba240 100644 --- a/clang/lib/AST/CommentSema.cpp +++ b/clang/lib/AST/CommentSema.cpp @@ -225,7 +225,7 @@ static ParamCommandPassDirection getParamPassDirection(StringRef Arg) { return llvm::StringSwitch<ParamCommandPassDirection>(Arg) .Case("[in]", ParamCommandPassDirection::In) .Case("[out]", ParamCommandPassDirection::Out) - .Cases("[in,out]", "[out,in]", ParamCommandPassDirection::InOut) + .Cases({"[in,out]", "[out,in]"}, ParamCommandPassDirection::InOut) .Default(static_cast<ParamCommandPassDirection>(-1)); } diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp index 2673669..90b4ac1 100644 --- a/clang/lib/Basic/Targets/AVR.cpp +++ b/clang/lib/Basic/Targets/AVR.cpp @@ -30,13 +30,13 @@ struct LLVM_LIBRARY_VISIBILITY MCUInfo { // NOTE: This list has been synchronized with gcc-avr 5.4.0 and avr-libc 2.0.0. static MCUInfo AVRMcus[] = { - {"avr1", NULL, "1", 0}, + {"avr1", nullptr, "1", 0}, {"at90s1200", "__AVR_AT90S1200__", "1", 0}, {"attiny11", "__AVR_ATtiny11__", "1", 0}, {"attiny12", "__AVR_ATtiny12__", "1", 0}, {"attiny15", "__AVR_ATtiny15__", "1", 0}, {"attiny28", "__AVR_ATtiny28__", "1", 0}, - {"avr2", NULL, "2", 1}, + {"avr2", nullptr, "2", 1}, {"at90s2313", "__AVR_AT90S2313__", "2", 1}, {"at90s2323", "__AVR_AT90S2323__", "2", 1}, {"at90s2333", "__AVR_AT90S2333__", "2", 1}, @@ -50,7 +50,7 @@ static MCUInfo AVRMcus[] = { {"at90s8515", "__AVR_AT90S8515__", "2", 1}, {"at90c8534", "__AVR_AT90c8534__", "2", 1}, {"at90s8535", "__AVR_AT90S8535__", "2", 1}, - {"avr25", NULL, "25", 1}, + {"avr25", nullptr, "25", 1}, {"ata5272", "__AVR_ATA5272__", "25", 1}, {"ata6616c", "__AVR_ATA6616c__", "25", 1}, {"attiny13", "__AVR_ATtiny13__", "25", 1}, @@ -80,13 +80,13 @@ static MCUInfo AVRMcus[] = { {"attiny48", "__AVR_ATtiny48__", "25", 1}, {"attiny88", "__AVR_ATtiny88__", "25", 1}, {"attiny828", "__AVR_ATtiny828__", "25", 1}, - {"avr3", NULL, "3", 1}, + {"avr3", nullptr, "3", 1}, {"at43usb355", "__AVR_AT43USB355__", "3", 1}, {"at76c711", "__AVR_AT76C711__", "3", 1}, - {"avr31", NULL, "31", 1}, + {"avr31", nullptr, "31", 1}, {"atmega103", "__AVR_ATmega103__", "31", 1}, {"at43usb320", "__AVR_AT43USB320__", "31", 1}, - {"avr35", NULL, "35", 1}, + {"avr35", nullptr, "35", 1}, {"attiny167", "__AVR_ATtiny167__", "35", 1}, {"at90usb82", "__AVR_AT90USB82__", "35", 1}, {"at90usb162", "__AVR_AT90USB162__", "35", 1}, @@ -97,7 +97,7 @@ static MCUInfo AVRMcus[] = { {"atmega16u2", "__AVR_ATmega16U2__", "35", 1}, {"atmega32u2", "__AVR_ATmega32U2__", "35", 1}, {"attiny1634", "__AVR_ATtiny1634__", "35", 1}, - {"avr4", NULL, "4", 1}, + {"avr4", nullptr, "4", 1}, {"atmega8", "__AVR_ATmega8__", "4", 1}, {"ata6289", "__AVR_ATA6289__", "4", 1}, {"atmega8a", "__AVR_ATmega8A__", "4", 1}, @@ -123,7 +123,7 @@ static MCUInfo AVRMcus[] = { {"at90pwm3", "__AVR_AT90PWM3__", "4", 1}, {"at90pwm3b", "__AVR_AT90PWM3B__", "4", 1}, {"at90pwm81", "__AVR_AT90PWM81__", "4", 1}, - {"avr5", NULL, "5", 1}, + {"avr5", nullptr, "5", 1}, {"ata5702m322", "__AVR_ATA5702M322__", "5", 1}, {"ata5782", "__AVR_ATA5782__", "5", 1}, {"ata5790", "__AVR_ATA5790__", "5", 1}, @@ -230,7 +230,7 @@ static MCUInfo AVRMcus[] = { {"at90scr100", "__AVR_AT90SCR100__", "5", 1}, {"at94k", "__AVR_AT94K__", "5", 1}, {"m3000", "__AVR_AT000__", "5", 1}, - {"avr51", NULL, "51", 2}, + {"avr51", nullptr, "51", 2}, {"atmega128", "__AVR_ATmega128__", "51", 2}, {"atmega128a", "__AVR_ATmega128A__", "51", 2}, {"atmega1280", "__AVR_ATmega1280__", "51", 2}, @@ -243,12 +243,12 @@ static MCUInfo AVRMcus[] = { {"at90can128", "__AVR_AT90CAN128__", "51", 2}, {"at90usb1286", "__AVR_AT90USB1286__", "51", 2}, {"at90usb1287", "__AVR_AT90USB1287__", "51", 2}, - {"avr6", NULL, "6", 4}, + {"avr6", nullptr, "6", 4}, {"atmega2560", "__AVR_ATmega2560__", "6", 4}, {"atmega2561", "__AVR_ATmega2561__", "6", 4}, {"atmega256rfr2", "__AVR_ATmega256RFR2__", "6", 4}, {"atmega2564rfr2", "__AVR_ATmega2564RFR2__", "6", 4}, - {"avrxmega2", NULL, "102", 1}, + {"avrxmega2", nullptr, "102", 1}, {"atxmega16a4", "__AVR_ATxmega16A4__", "102", 1}, {"atxmega16a4u", "__AVR_ATxmega16A4U__", "102", 1}, {"atxmega16c4", "__AVR_ATxmega16C4__", "102", 1}, @@ -262,7 +262,7 @@ static MCUInfo AVRMcus[] = { {"atxmega32e5", "__AVR_ATxmega32E5__", "102", 1}, {"atxmega16e5", "__AVR_ATxmega16E5__", "102", 1}, {"atxmega8e5", "__AVR_ATxmega8E5__", "102", 1}, - {"avrxmega4", NULL, "104", 1}, + {"avrxmega4", nullptr, "104", 1}, {"atxmega64a3", "__AVR_ATxmega64A3__", "104", 1}, {"atxmega64a3u", "__AVR_ATxmega64A3U__", "104", 1}, {"atxmega64a4u", "__AVR_ATxmega64A4U__", "104", 1}, @@ -271,10 +271,10 @@ static MCUInfo AVRMcus[] = { {"atxmega64c3", "__AVR_ATxmega64C3__", "104", 1}, {"atxmega64d3", "__AVR_ATxmega64D3__", "104", 1}, {"atxmega64d4", "__AVR_ATxmega64D4__", "104", 1}, - {"avrxmega5", NULL, "105", 1}, + {"avrxmega5", nullptr, "105", 1}, {"atxmega64a1", "__AVR_ATxmega64A1__", "105", 1}, {"atxmega64a1u", "__AVR_ATxmega64A1U__", "105", 1}, - {"avrxmega6", NULL, "106", 6}, + {"avrxmega6", nullptr, "106", 6}, {"atxmega128a3", "__AVR_ATxmega128A3__", "106", 2}, {"atxmega128a3u", "__AVR_ATxmega128A3U__", "106", 2}, {"atxmega128b1", "__AVR_ATxmega128B1__", "106", 2}, @@ -294,11 +294,11 @@ static MCUInfo AVRMcus[] = { {"atxmega256d3", "__AVR_ATxmega256D3__", "106", 4}, {"atxmega384c3", "__AVR_ATxmega384C3__", "106", 6}, {"atxmega384d3", "__AVR_ATxmega384D3__", "106", 6}, - {"avrxmega7", NULL, "107", 2}, + {"avrxmega7", nullptr, "107", 2}, {"atxmega128a1", "__AVR_ATxmega128A1__", "107", 2}, {"atxmega128a1u", "__AVR_ATxmega128A1U__", "107", 2}, {"atxmega128a4u", "__AVR_ATxmega128A4U__", "107", 2}, - {"avrtiny", NULL, "100", 0}, + {"avrtiny", nullptr, "100", 0}, {"attiny4", "__AVR_ATtiny4__", "100", 0}, {"attiny5", "__AVR_ATtiny5__", "100", 0}, {"attiny9", "__AVR_ATtiny9__", "100", 0}, @@ -307,7 +307,7 @@ static MCUInfo AVRMcus[] = { {"attiny40", "__AVR_ATtiny40__", "100", 0}, {"attiny102", "__AVR_ATtiny102__", "100", 0}, {"attiny104", "__AVR_ATtiny104__", "100", 0}, - {"avrxmega3", NULL, "103", 1}, + {"avrxmega3", nullptr, "103", 1}, {"attiny202", "__AVR_ATtiny202__", "103", 1}, {"attiny402", "__AVR_ATtiny402__", "103", 1}, {"attiny204", "__AVR_ATtiny204__", "103", 1}, diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 9651c38..ec4e40b 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -171,7 +171,7 @@ ArrayRef<const char *> NVPTXTargetInfo::getGCCRegNames() const { bool NVPTXTargetInfo::hasFeature(StringRef Feature) const { return llvm::StringSwitch<bool>(Feature) - .Cases("ptx", "nvptx", true) + .Cases({"ptx", "nvptx"}, true) .Default(false); } diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index d2eb9c5..d4ada2a 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -125,9 +125,8 @@ public: .Cases({"power3", "pwr3"}, ArchDefinePpcgr) .Cases({"power4", "pwr4"}, ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power5", "pwr5", - ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | - ArchDefinePpcsq) + .Cases({"power5", "pwr5"}, ArchDefinePwr5 | ArchDefinePwr4 | + ArchDefinePpcgr | ArchDefinePpcsq) .Cases({"power5x", "pwr5x"}, ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) @@ -166,7 +165,7 @@ public: ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("8548", "e500", ArchDefineE500) + .Cases({"8548", "e500"}, ArchDefineE500) .Default(ArchDefineNone); } return CPUKnown; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index e71f10c..7a90c89 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -396,8 +396,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasAMXFP8 = true; } else if (Feature == "+amx-movrs") { HasAMXMOVRS = true; - } else if (Feature == "+amx-transpose") { - HasAMXTRANSPOSE = true; } else if (Feature == "+amx-avx512") { HasAMXAVX512 = true; } else if (Feature == "+amx-tf32") { @@ -925,8 +923,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_FP8__"); if (HasAMXMOVRS) Builder.defineMacro("__AMX_MOVRS__"); - if (HasAMXTRANSPOSE) - Builder.defineMacro("__AMX_TRANSPOSE__"); if (HasAMXAVX512) Builder.defineMacro("__AMX_AVX512__"); if (HasAMXTF32) @@ -1068,7 +1064,6 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-movrs", true) .Case("amx-tf32", true) .Case("amx-tile", true) - .Case("amx-transpose", true) .Case("avx", true) .Case("avx10.1", true) .Case("avx10.2", true) @@ -1189,7 +1184,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-movrs", HasAMXMOVRS) .Case("amx-tf32", HasAMXTF32) .Case("amx-tile", HasAMXTILE) - .Case("amx-transpose", HasAMXTRANSPOSE) .Case("avx", SSELevel >= AVX) .Case("avx10.1", HasAVX10_1) .Case("avx10.2", HasAVX10_2) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index be3a473..e7da262 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -160,7 +160,6 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXCOMPLEX = false; bool HasAMXFP8 = false; bool HasAMXMOVRS = false; - bool HasAMXTRANSPOSE = false; bool HasAMXAVX512 = false; bool HasAMXTF32 = false; bool HasSERIALIZE = false; diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index 3c9c7ec..0198a9d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -771,14 +771,6 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI_WriteBarrier: case X86::BI_AddressOfReturnAddress: case X86::BI__stosb: - case X86::BI__builtin_ia32_t2rpntlvwz0_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: case X86::BI__ud2: case X86::BI__int2c: case X86::BI__readfsbyte: diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 71ff20a..5d5209b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -242,12 +242,19 @@ void CIRGenFunction::LexicalScope::cleanup() { } }; - if (returnBlock != nullptr) { - // Write out the return block, which loads the value from `__retval` and - // issues the `cir.return`. + // Cleanup are done right before codegen resumes a scope. This is where + // objects are destroyed. Process all return blocks. + // TODO(cir): Handle returning from a switch statement through a cleanup + // block. We can't simply jump to the cleanup block, because the cleanup block + // is not part of the case region. Either reemit all cleanups in the return + // block or wait for MLIR structured control flow to support early exits. + llvm::SmallVector<mlir::Block *> retBlocks; + for (mlir::Block *retBlock : localScope->getRetBlocks()) { mlir::OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToEnd(returnBlock); - (void)emitReturn(*returnLoc); + builder.setInsertionPointToEnd(retBlock); + retBlocks.push_back(retBlock); + mlir::Location retLoc = localScope->getRetLoc(retBlock); + emitReturn(retLoc); } auto insertCleanupAndLeave = [&](mlir::Block *insPt) { @@ -274,19 +281,22 @@ void CIRGenFunction::LexicalScope::cleanup() { if (localScope->depth == 0) { // Reached the end of the function. - if (returnBlock != nullptr) { - if (returnBlock->getUses().empty()) { - returnBlock->erase(); + // Special handling only for single return block case + if (localScope->getRetBlocks().size() == 1) { + mlir::Block *retBlock = localScope->getRetBlocks()[0]; + mlir::Location retLoc = localScope->getRetLoc(retBlock); + if (retBlock->getUses().empty()) { + retBlock->erase(); } else { // Thread return block via cleanup block. if (cleanupBlock) { - for (mlir::BlockOperand &blockUse : returnBlock->getUses()) { + for (mlir::BlockOperand &blockUse : retBlock->getUses()) { cir::BrOp brOp = mlir::cast<cir::BrOp>(blockUse.getOwner()); brOp.setSuccessor(cleanupBlock); } } - cir::BrOp::create(builder, *returnLoc, returnBlock); + cir::BrOp::create(builder, retLoc, retBlock); return; } } @@ -324,8 +334,10 @@ void CIRGenFunction::LexicalScope::cleanup() { bool entryBlock = builder.getInsertionBlock()->isEntryBlock(); if (!entryBlock && curBlock->empty()) { curBlock->erase(); - if (returnBlock != nullptr && returnBlock->getUses().empty()) - returnBlock->erase(); + for (mlir::Block *retBlock : retBlocks) { + if (retBlock->getUses().empty()) + retBlock->erase(); + } return; } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index c3fcd1a6..e5cecaa5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1103,44 +1103,69 @@ public: // --- private: - // `returnBlock`, `returnLoc`, and all the functions that deal with them - // will change and become more complicated when `switch` statements are - // upstreamed. `case` statements within the `switch` are in the same scope - // but have their own regions. Therefore the LexicalScope will need to - // keep track of multiple return blocks. - mlir::Block *returnBlock = nullptr; - std::optional<mlir::Location> returnLoc; - - // See the comment on `getOrCreateRetBlock`. + // On switches we need one return block per region, since cases don't + // have their own scopes but are distinct regions nonetheless. + + // TODO: This implementation should change once we have support for early + // exits in MLIR structured control flow (llvm-project#161575) + llvm::SmallVector<mlir::Block *> retBlocks; + llvm::DenseMap<mlir::Block *, mlir::Location> retLocs; + llvm::DenseMap<cir::CaseOp, unsigned> retBlockInCaseIndex; + std::optional<unsigned> normalRetBlockIndex; + + // There's usually only one ret block per scope, but this needs to be + // get or create because of potential unreachable return statements, note + // that for those, all source location maps to the first one found. mlir::Block *createRetBlock(CIRGenFunction &cgf, mlir::Location loc) { - assert(returnBlock == nullptr && "only one return block per scope"); - // Create the cleanup block but don't hook it up just yet. + assert((isa_and_nonnull<cir::CaseOp>( + cgf.builder.getBlock()->getParentOp()) || + retBlocks.size() == 0) && + "only switches can hold more than one ret block"); + + // Create the return block but don't hook it up just yet. mlir::OpBuilder::InsertionGuard guard(cgf.builder); - returnBlock = - cgf.builder.createBlock(cgf.builder.getBlock()->getParent()); - updateRetLoc(returnBlock, loc); - return returnBlock; + auto *b = cgf.builder.createBlock(cgf.builder.getBlock()->getParent()); + retBlocks.push_back(b); + updateRetLoc(b, loc); + return b; } cir::ReturnOp emitReturn(mlir::Location loc); void emitImplicitReturn(); public: - mlir::Block *getRetBlock() { return returnBlock; } - mlir::Location getRetLoc(mlir::Block *b) { return *returnLoc; } - void updateRetLoc(mlir::Block *b, mlir::Location loc) { returnLoc = loc; } - - // Create the return block for this scope, or return the existing one. - // This get-or-create logic is necessary to handle multiple return - // statements within the same scope, which can happen if some of them are - // dead code or if there is a `goto` into the middle of the scope. + llvm::ArrayRef<mlir::Block *> getRetBlocks() { return retBlocks; } + mlir::Location getRetLoc(mlir::Block *b) { return retLocs.at(b); } + void updateRetLoc(mlir::Block *b, mlir::Location loc) { + retLocs.insert_or_assign(b, loc); + } + mlir::Block *getOrCreateRetBlock(CIRGenFunction &cgf, mlir::Location loc) { - if (returnBlock == nullptr) { - returnBlock = createRetBlock(cgf, loc); - return returnBlock; + // Check if we're inside a case region + if (auto caseOp = mlir::dyn_cast_if_present<cir::CaseOp>( + cgf.builder.getBlock()->getParentOp())) { + auto iter = retBlockInCaseIndex.find(caseOp); + if (iter != retBlockInCaseIndex.end()) { + // Reuse existing return block + mlir::Block *ret = retBlocks[iter->second]; + updateRetLoc(ret, loc); + return ret; + } + // Create new return block + mlir::Block *ret = createRetBlock(cgf, loc); + retBlockInCaseIndex[caseOp] = retBlocks.size() - 1; + return ret; } - updateRetLoc(returnBlock, loc); - return returnBlock; + + if (normalRetBlockIndex) { + mlir::Block *ret = retBlocks[*normalRetBlockIndex]; + updateRetLoc(ret, loc); + return ret; + } + + mlir::Block *ret = createRetBlock(cgf, loc); + normalRetBlockIndex = retBlocks.size() - 1; + return ret; } mlir::Block *getEntryBlock() { return entryBlock; } diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index 5010137..527dfd2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -126,7 +126,7 @@ class OpenACCClauseCIREmitter final .CaseLower("default", mlir::acc::DeviceType::Default) .CaseLower("host", mlir::acc::DeviceType::Host) .CaseLower("multicore", mlir::acc::DeviceType::Multicore) - .CasesLower("nvidia", "acc_device_nvidia", + .CasesLower({"nvidia", "acc_device_nvidia"}, mlir::acc::DeviceType::Nvidia) .CaseLower("radeon", mlir::acc::DeviceType::Radeon); } diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 3c31314..b967a26 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -313,7 +313,7 @@ getCodeModel(const CodeGenOptions &CodeGenOpts) { .Case("kernel", llvm::CodeModel::Kernel) .Case("medium", llvm::CodeModel::Medium) .Case("large", llvm::CodeModel::Large) - .Cases("default", "", ~1u) + .Cases({"default", ""}, ~1u) .Default(~0u); assert(CodeModel != ~0u && "invalid code model!"); if (CodeModel == ~1u) diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp index 6da65b6..8a1cab3 100644 --- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp @@ -375,28 +375,28 @@ static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS, CGF.EmitScalarExpr(E->getArg(1))}); } -static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID, - const CallExpr *E, CodeGenFunction &CGF) { +static bool EnsureNativeHalfSupport(unsigned BuiltinID, const CallExpr *E, + CodeGenFunction &CGF) { auto &C = CGF.CGM.getContext(); - if (!(C.getLangOpts().NativeHalfType || - !C.getTargetInfo().useFP16ConversionIntrinsics())) { + if (!C.getLangOpts().NativeHalfType && + C.getTargetInfo().useFP16ConversionIntrinsics()) { CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getQuotedName(BuiltinID) + " requires native half type support."); - return nullptr; + return false; } + return true; +} - if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2) - return MakeLdg(CGF, E); - - if (IntrinsicID == Intrinsic::nvvm_ldu_global_f) - return MakeLdu(IntrinsicID, CGF, E); +static Value *MakeHalfType(Function *Intrinsic, unsigned BuiltinID, + const CallExpr *E, CodeGenFunction &CGF) { + if (!EnsureNativeHalfSupport(BuiltinID, E, CGF)) + return nullptr; SmallVector<Value *, 16> Args; - auto *F = CGF.CGM.getIntrinsic(IntrinsicID); - auto *FTy = F->getFunctionType(); + auto *FTy = Intrinsic->getFunctionType(); unsigned ICEArguments = 0; ASTContext::GetBuiltinTypeError Error; - C.GetBuiltinType(BuiltinID, Error, &ICEArguments); + CGF.CGM.getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); assert(Error == ASTContext::GE_None && "Should not codegen an error"); for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) { assert((ICEArguments & (1 << i)) == 0); @@ -407,8 +407,14 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID, Args.push_back(ArgValue); } - return CGF.Builder.CreateCall(F, Args); + return CGF.Builder.CreateCall(Intrinsic, Args); } + +static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID, + const CallExpr *E, CodeGenFunction &CGF) { + return MakeHalfType(CGF.CGM.getIntrinsic(IntrinsicID), BuiltinID, E, CGF); +} + } // namespace Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, @@ -913,9 +919,14 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, } // The following builtins require half type support case NVPTX::BI__nvvm_ex2_approx_f16: - return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this); + return MakeHalfType( + CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx, Builder.getHalfTy()), + BuiltinID, E, *this); case NVPTX::BI__nvvm_ex2_approx_f16x2: - return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this); + return MakeHalfType( + CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx, + FixedVectorType::get(Builder.getHalfTy(), 2)), + BuiltinID, E, *this); case NVPTX::BI__nvvm_ff2f16x2_rn: return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this); case NVPTX::BI__nvvm_ff2f16x2_rn_relu: @@ -1049,12 +1060,22 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__nvvm_fabs_d: return Builder.CreateUnaryIntrinsic(Intrinsic::fabs, EmitScalarExpr(E->getArg(0))); + case NVPTX::BI__nvvm_ex2_approx_d: + case NVPTX::BI__nvvm_ex2_approx_f: + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx, + EmitScalarExpr(E->getArg(0))); + case NVPTX::BI__nvvm_ex2_approx_ftz_f: + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx_ftz, + EmitScalarExpr(E->getArg(0))); case NVPTX::BI__nvvm_ldg_h: case NVPTX::BI__nvvm_ldg_h2: - return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this); + return EnsureNativeHalfSupport(BuiltinID, E, *this) ? MakeLdg(*this, E) + : nullptr; case NVPTX::BI__nvvm_ldu_h: case NVPTX::BI__nvvm_ldu_h2: - return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this); + return EnsureNativeHalfSupport(BuiltinID, E, *this) + ? MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E) + : nullptr; case NVPTX::BI__nvvm_cp_async_ca_shared_global_4: return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4, Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E, diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index b924407..2381b2e 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -2931,74 +2931,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // instruction, but it will create a memset that won't be optimized away. return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true); } - // Corresponding to intrisics which will return 2 tiles (tile0_tile1). - case X86::BI__builtin_ia32_t2rpntlvwz0_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: { - Intrinsic::ID IID; - switch (BuiltinID) { - default: - llvm_unreachable("Unsupported intrinsic!"); - case X86::BI__builtin_ia32_t2rpntlvwz0_internal: - IID = Intrinsic::x86_t2rpntlvwz0_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: - IID = Intrinsic::x86_t2rpntlvwz0rs_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: - IID = Intrinsic::x86_t2rpntlvwz0t1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: - IID = Intrinsic::x86_t2rpntlvwz0rst1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - IID = Intrinsic::x86_t2rpntlvwz1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: - IID = Intrinsic::x86_t2rpntlvwz1rs_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: - IID = Intrinsic::x86_t2rpntlvwz1t1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: - IID = Intrinsic::x86_t2rpntlvwz1rst1_internal; - break; - } - - // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) - Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), - {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]}); - - auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>(); - assert(PtrTy && "arg3 must be of pointer type"); - QualType PtreeTy = PtrTy->getPointeeType(); - llvm::Type *TyPtee = ConvertType(PtreeTy); - - // Bitcast amx type (x86_amx) to vector type (256 x i32) - // Then store tile0 into DstPtr0 - Value *T0 = Builder.CreateExtractValue(Call, 0); - Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, - {TyPtee}, {T0}); - Builder.CreateDefaultAlignedStore(VecT0, Ops[3]); - - // Then store tile1 into DstPtr1 - Value *T1 = Builder.CreateExtractValue(Call, 1); - Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, - {TyPtee}, {T1}); - Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]); - - // Note: Here we escape directly use x86_tilestored64_internal to store - // the results due to it can't make sure the Mem written scope. This may - // cause shapes reloads after first amx intrinsic, which current amx reg- - // ister allocation has no ability to handle it. - - return Store; - } case X86::BI__ud2: // llvm.trap makes a ud2a instruction on x86. return EmitTrapCall(Intrinsic::trap); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 71c5280..51618d1 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -2540,10 +2540,14 @@ bool Driver::HandleImmediateArgs(Compilation &C) { } if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) { - if (std::optional<std::string> RuntimePath = TC.getRuntimePath()) - llvm::outs() << *RuntimePath << '\n'; - else - llvm::outs() << TC.getCompilerRTPath() << '\n'; + for (auto RuntimePath : + {TC.getRuntimePath(), std::make_optional(TC.getCompilerRTPath())}) { + if (RuntimePath && getVFS().exists(*RuntimePath)) { + llvm::outs() << *RuntimePath << '\n'; + return false; + } + } + llvm::outs() << "(runtime dir is not present)" << '\n'; return false; } diff --git a/clang/lib/Driver/ToolChains/Arch/M68k.cpp b/clang/lib/Driver/ToolChains/Arch/M68k.cpp index 1037c0e..708ec84 100644 --- a/clang/lib/Driver/ToolChains/Arch/M68k.cpp +++ b/clang/lib/Driver/ToolChains/Arch/M68k.cpp @@ -36,12 +36,12 @@ std::string m68k::getM68kTargetCPU(const ArgList &Args) { return "generic"; return llvm::StringSwitch<std::string>(CPUName) - .Cases("m68000", "68000", "M68000") - .Cases("m68010", "68010", "M68010") - .Cases("m68020", "68020", "M68020") - .Cases("m68030", "68030", "M68030") - .Cases("m68040", "68040", "M68040") - .Cases("m68060", "68060", "M68060") + .Cases({"m68000", "68000"}, "M68000") + .Cases({"m68010", "68010"}, "M68010") + .Cases({"m68020", "68020"}, "M68020") + .Cases({"m68030", "68030"}, "M68030") + .Cases({"m68040", "68040"}, "M68040") + .Cases({"m68060", "68060"}, "M68060") .Default(CPUName.str()); } // FIXME: Throw error when multiple sub-architecture flag exist diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp index 6a6a4ee..8d7b85d 100644 --- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp @@ -117,7 +117,7 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple, // Deduce CPU name from ABI name. CPUName = llvm::StringSwitch<const char *>(ABIName) .Case("o32", DefMips32CPU) - .Cases("n32", "n64", DefMips64CPU) + .Cases({"n32", "n64"}, DefMips64CPU) .Default(""); } @@ -467,7 +467,7 @@ bool mips::isNaN2008(const Driver &D, const ArgList &Args, // NaN2008 is the default for MIPS32r6/MIPS64r6. return llvm::StringSwitch<bool>(getCPUName(D, Args, Triple)) - .Cases("mips32r6", "mips64r6", true) + .Cases({"mips32r6", "mips64r6"}, true) .Default(false); } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 4e8f63e..d3ab6f1 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3708,6 +3708,7 @@ static void RenderHLSLOptions(const ArgList &Args, ArgStringList &CmdArgs, options::OPT_emit_obj, options::OPT_disable_llvm_passes, options::OPT_fnative_half_type, + options::OPT_fnative_int16_type, options::OPT_hlsl_entrypoint, options::OPT_fdx_rootsignature_define, options::OPT_fdx_rootsignature_version, diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index cc5bcd1..2fb7652 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1035,12 +1035,12 @@ static const char *ArmMachOArchName(StringRef Arch) { .Case("xscale", "xscale") .Case("armv4t", "armv4t") .Case("armv7", "armv7") - .Cases("armv7a", "armv7-a", "armv7") - .Cases("armv7r", "armv7-r", "armv7") - .Cases("armv7em", "armv7e-m", "armv7em") - .Cases("armv7k", "armv7-k", "armv7k") - .Cases("armv7m", "armv7-m", "armv7m") - .Cases("armv7s", "armv7-s", "armv7s") + .Cases({"armv7a", "armv7-a"}, "armv7") + .Cases({"armv7r", "armv7-r"}, "armv7") + .Cases({"armv7em", "armv7e-m"}, "armv7em") + .Cases({"armv7k", "armv7-k"}, "armv7k") + .Cases({"armv7m", "armv7-m"}, "armv7m") + .Cases({"armv7s", "armv7-s"}, "armv7s") .Default(nullptr); } diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp index 20a320e..8d3fba7 100644 --- a/clang/lib/Driver/ToolChains/HLSL.cpp +++ b/clang/lib/Driver/ToolChains/HLSL.cpp @@ -498,6 +498,15 @@ HLSLToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch, continue; } + if (A->getOption().getID() == options::OPT_enable_16bit_types) { + // Translate -enable-16bit-types into -fnative-half-type and + // -fnative-int16-type + DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_half_type)); + DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_int16_type)); + A->claim(); + continue; + } + DAL->append(A); } diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index 02aa598..64c7d1c 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -346,7 +346,7 @@ SanitizerMask Solaris::getSupportedSanitizers() const { const char *Solaris::getDefaultLinker() const { // FIXME: Only handle Solaris ld and GNU ld here. return llvm::StringSwitch<const char *>(getDriver().getPreferredLinker()) - .Cases("bfd", "gld", "/usr/gnu/bin/ld") + .Cases({"bfd", "gld"}, "/usr/gnu/bin/ld") .Default("/usr/bin/ld"); } diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index bd36eb4..be7c1d3 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4049,18 +4049,18 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, // -cl-std only applies for OpenCL language standards. // Override the -std option in this case. if (const Arg *A = Args.getLastArg(OPT_cl_std_EQ)) { - LangStandard::Kind OpenCLLangStd - = llvm::StringSwitch<LangStandard::Kind>(A->getValue()) - .Cases("cl", "CL", LangStandard::lang_opencl10) - .Cases("cl1.0", "CL1.0", LangStandard::lang_opencl10) - .Cases("cl1.1", "CL1.1", LangStandard::lang_opencl11) - .Cases("cl1.2", "CL1.2", LangStandard::lang_opencl12) - .Cases("cl2.0", "CL2.0", LangStandard::lang_opencl20) - .Cases("cl3.0", "CL3.0", LangStandard::lang_opencl30) - .Cases("clc++", "CLC++", LangStandard::lang_openclcpp10) - .Cases("clc++1.0", "CLC++1.0", LangStandard::lang_openclcpp10) - .Cases("clc++2021", "CLC++2021", LangStandard::lang_openclcpp2021) - .Default(LangStandard::lang_unspecified); + LangStandard::Kind OpenCLLangStd = + llvm::StringSwitch<LangStandard::Kind>(A->getValue()) + .Cases({"cl", "CL"}, LangStandard::lang_opencl10) + .Cases({"cl1.0", "CL1.0"}, LangStandard::lang_opencl10) + .Cases({"cl1.1", "CL1.1"}, LangStandard::lang_opencl11) + .Cases({"cl1.2", "CL1.2"}, LangStandard::lang_opencl12) + .Cases({"cl2.0", "CL2.0"}, LangStandard::lang_opencl20) + .Cases({"cl3.0", "CL3.0"}, LangStandard::lang_opencl30) + .Cases({"clc++", "CLC++"}, LangStandard::lang_openclcpp10) + .Cases({"clc++1.0", "CLC++1.0"}, LangStandard::lang_openclcpp10) + .Cases({"clc++2021", "CLC++2021"}, LangStandard::lang_openclcpp2021) + .Default(LangStandard::lang_unspecified); if (OpenCLLangStd == LangStandard::lang_unspecified) { Diags.Report(diag::err_drv_invalid_value) @@ -4600,7 +4600,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, // Validate that if fnative-half-type is given, that // the language standard is at least hlsl2018, and that // the target shader model is at least 6.2. - if (Args.getLastArg(OPT_fnative_half_type)) { + if (Args.getLastArg(OPT_fnative_half_type) || + Args.getLastArg(OPT_fnative_int16_type)) { const LangStandard &Std = LangStandard::getLangStandardForKind(Opts.LangStd); if (!(Opts.LangStd >= LangStandard::lang_hlsl2018 && @@ -4614,12 +4615,16 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Diags.Report(diag::err_drv_hlsl_bad_shader_unsupported) << VulkanEnv << T.getOSName() << T.str(); } - if (Args.getLastArg(OPT_fnative_half_type)) { + if (Args.getLastArg(OPT_fnative_half_type) || + Args.getLastArg(OPT_fnative_int16_type)) { + const char *Str = Args.getLastArg(OPT_fnative_half_type) + ? "-fnative-half-type" + : "-fnative-int16-type"; const LangStandard &Std = LangStandard::getLangStandardForKind(Opts.LangStd); if (!(Opts.LangStd >= LangStandard::lang_hlsl2018)) Diags.Report(diag::err_drv_hlsl_16bit_types_unsupported) - << "-fnative-half-type" << false << Std.getName(); + << Str << false << Std.getName(); } } else { llvm_unreachable("expected DXIL or SPIR-V target"); diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 47f1d5a..8602be1 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -399,7 +399,7 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, Builder.defineMacro("__HLSL_202y", Twine((unsigned)LangOptions::HLSLLangStd::HLSL_202y)); - if (LangOpts.NativeHalfType) + if (LangOpts.NativeHalfType && LangOpts.NativeInt16Type) Builder.defineMacro("__HLSL_ENABLE_16_BIT", "1"); // Shader target information diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 1858912..33fff76 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -162,18 +162,12 @@ set(x86_files adxintrin.h ammintrin.h amxavx512intrin.h - amxbf16transposeintrin.h amxcomplexintrin.h - amxcomplextransposeintrin.h amxfp16intrin.h - amxfp16transposeintrin.h amxfp8intrin.h amxintrin.h amxmovrsintrin.h - amxmovrstransposeintrin.h amxtf32intrin.h - amxtf32transposeintrin.h - amxtransposeintrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h deleted file mode 100644 index 86f09f2..0000000 --- a/clang/lib/Headers/amxbf16transposeintrin.h +++ /dev/null @@ -1,94 +0,0 @@ -/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead." -#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_BF16TRANSPOSEINTRIN_H -#define __AMX_BF16TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-bf16,amx-transpose"))) - -/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in -/// tiles \a a and \a b, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in \a dst, and store the -/// 32-bit result back to tile \a dst. -/// -/// \headerfile <immintrin.h> -/// -/// \code -/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b) -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO (a.colsb / 4) - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) * -/// FP32(b.row[k].bf16[2*n+0]) -/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) * -/// FP32(b.row[k].bf16[2*n+1]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTDPBF16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b)) - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS -_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2); -} - -/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in -/// tiles src0 and src1, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in "dst", and store the -/// 32-bit result back to tile "dst". -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __x86_64__ */ -#endif /* __AMX_BF16TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h deleted file mode 100644 index 11abaf9..0000000 --- a/clang/lib/Headers/amxcomplextransposeintrin.h +++ /dev/null @@ -1,303 +0,0 @@ -/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead." -#endif // __IMMINTRIN_H - -#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H -#define __AMX_COMPLEXTRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-complex,amx-transpose"))) - -/// Perform matrix multiplication of two tiles containing complex elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles \a a and \a b is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// Calculates the imaginary part of the result. For each possible combination -/// of (transposed column of \a a, column of \a b), it performs a set of -/// multiplication and accumulations on all corresponding complex numbers -/// (one from \a a and one from \a b). The imaginary part of the \a a element -/// is multiplied with the real part of the corresponding \a b element, and -/// the real part of the \a a element is multiplied with the imaginary part -/// of the corresponding \a b elements. The two accumulated results are -/// added, and then accumulated into the corresponding row and column of -/// \a dst. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b); -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO a.rows - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_tcmmimfp16ps(dst, a, b) \ - __builtin_ia32_ttcmmimfp16ps((dst), (a), (b)) - -/// Perform matrix multiplication of two tiles containing complex elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles \a a and \a b is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// Calculates the real part of the result. For each possible combination -/// of (rtransposed colum of \a a, column of \a b), it performs a set of -/// multiplication and accumulations on all corresponding complex numbers -/// (one from \a a and one from \a b). The real part of the \a a element is -/// multiplied with the real part of the corresponding \a b element, and the -/// negated imaginary part of the \a a element is multiplied with the -/// imaginary part of the corresponding \a b elements. The two accumulated -/// results are added, and then accumulated into the corresponding row and -/// column of \a dst. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b); -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO a.rows - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) -/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_tcmmrlfp16ps(dst, a, b) \ - __builtin_ia32_ttcmmrlfp16ps((dst), (a), (b)) - -/// Perform matrix conjugate transpose and multiplication of two tiles -/// containing complex elements and accumulate the results into a packed -/// single precision tile. Each dword element in input tiles \a a and \a b -/// is interpreted as a complex number with FP16 real part and FP16 imaginary -/// part. -/// Calculates the imaginary part of the result. For each possible combination -/// of (transposed column of \a a, column of \a b), it performs a set of -/// multiplication and accumulations on all corresponding complex numbers -/// (one from \a a and one from \a b). The negated imaginary part of the \a a -/// element is multiplied with the real part of the corresponding \a b -/// element, and the real part of the \a a element is multiplied with the -/// imaginary part of the corresponding \a b elements. The two accumulated -/// results are added, and then accumulated into the corresponding row and -/// column of \a dst. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b); -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO a.rows - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) -/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_conjtcmmimfp16ps(dst, a, b) \ - __builtin_ia32_tconjtcmmimfp16ps((dst), (a), (b)) - -/// Perform conjugate transpose of an FP16-pair of complex elements from \a a -/// and writes the result to \a dst. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// void _tile_conjtfp16(__tile dst, __tile a); -/// \endcode -/// -/// \code{.operation} -/// FOR i := 0 TO dst.rows - 1 -/// FOR j := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0] -/// tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1] -/// ENDFOR -/// write_row_and_zero(dst, i, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TCONJTFP16 instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The source tile. Max size is 1024 Bytes. -#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16((dst), (a)) - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal( - unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, - _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2); -} - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal( - unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, - _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2); -} - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal( - unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, - _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2); -} - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS -_tile_conjtfp16_internal(unsigned short m, unsigned short n, _tile1024i src) { - return __builtin_ia32_tconjtfp16_internal(m, n, src); -} - -/// Perform matrix multiplication of two tiles containing complex elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles src0 and src1 is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// This function calculates the imaginary part of the result. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -/// Perform matrix multiplication of two tiles containing complex elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles src0 and src1 is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// This function calculates the real part of the result. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -/// Perform matrix conjugate transpose and multiplication of two tiles -/// containing complex elements and accumulate the results into a packed -/// single precision tile. Each dword element in input tiles src0 and src1 -/// is interpreted as a complex number with FP16 real part and FP16 imaginary -/// part. -/// This function calculates the imaginary part of the result. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -/// Perform conjugate transpose of an FP16-pair of complex elements from src and -/// writes the result to dst. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src -/// The source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) { - dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile); -} - -#undef __DEFAULT_FN_ATTRS - -#endif // __x86_64__ -#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H diff --git a/clang/lib/Headers/amxfp16transposeintrin.h b/clang/lib/Headers/amxfp16transposeintrin.h deleted file mode 100644 index 191f8c6..0000000 --- a/clang/lib/Headers/amxfp16transposeintrin.h +++ /dev/null @@ -1,94 +0,0 @@ -/*===----- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE ------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use <amxfp16transposeintrin.h> directly; use <immintrin.h> instead." -#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_FP16TRANSPOSEINTRIN_H -#define __AMX_FP16TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-fp16,amx-transpose"))) - -/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in -/// tiles \a a and \a b, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in \a dst, and store the -/// 32-bit result back to tile \a dst. -/// -/// \headerfile <immintrin.h> -/// -/// \code -/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b) -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO (a.colsb / 4) - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * -/// FP32(b.row[k].fp16[2*n+0]) -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * -/// FP32(b.row[k].fp16[2*n+1]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTDPFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps((dst), (a), (b)) - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS -_tile_tdpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttdpfp16ps_internal(m, n, k, dst, src1, src2); -} - -/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in -/// tiles src0 and src1, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in "dst", and store the -/// 32-bit result back to tile "dst". -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTDPFP16PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static __inline__ void __tile_tdpfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tdpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __x86_64__ */ -#endif /* __AMX_FP16TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index a7da10d..208aa358 100644 --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -230,8 +230,6 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { /// bytes. Since there is no 2D type in llvm IR, we use vector type to /// represent 2D tile and the fixed size is maximum amx tile register size. typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); -typedef int _tile1024i_1024a - __attribute__((__vector_size__(1024), __aligned__(1024))); /// This is internal intrinsic. C/C++ user should avoid calling it directly. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h deleted file mode 100644 index 5f48cba..0000000 --- a/clang/lib/Headers/amxmovrstransposeintrin.h +++ /dev/null @@ -1,200 +0,0 @@ -/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * ===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead." -#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H -#define __AMX_MOVRS_TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-transpose,amx-movrs"))) - -#define _tile_2rpntlvwz0rs(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride) -#define _tile_2rpntlvwz0rst1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride) -#define _tile_2rpntlvwz1rs(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride) -#define _tile_2rpntlvwz1rst1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride) - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - // Use __tile1024i_1024a* to escape the alignment check in - // clang/test/Headers/x86-intrinsics-headers-clean.cpp - __builtin_ia32_t2rpntlvwz0rs_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz0rst1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1rs_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1rst1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -#undef __DEFAULT_FN_ATTRS -#endif /* __x86_64__ */ -#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/amxtf32transposeintrin.h b/clang/lib/Headers/amxtf32transposeintrin.h deleted file mode 100644 index e1b90c1..0000000 --- a/clang/lib/Headers/amxtf32transposeintrin.h +++ /dev/null @@ -1,105 +0,0 @@ -/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error \ - "Never use <amxtf32transposeintrin.h> directly; include <immintrin.h> instead." -#endif // __IMMINTRIN_H - -#ifndef __AMX_TF32TRANSPOSEINTRIN_H -#define __AMX_TF32TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-tf32,amx-transpose"))) - -/// \code -/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \ -/// constexpr int b); -/// \endcode -/// -/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction. -/// -/// \param srcdst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -/// -/// \code{.operation} -/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) { -/// dword[12:0] := 0 -/// dword[31:13] := x[31:13] -/// return dword -/// } -/// -/// DEFINE silence_snan_fp32(x[31:0]) { -/// IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0) -/// x.fraction[22] := 1 -/// return x -/// } -/// -/// elements_dest:= srcdst.colsb/4 -/// -/// FOR m := 0 TO (srcdst.rows-1) -/// tmp[511:0] := 0 -/// FOR k := 0 TO (a.rows-1) -/// FOR n := 0 TO (elements_dest-1) -/// a1e := silence_snan_fp32(a.row[k].fp32[m]) -/// a2e := silence_snan_fp32(b.row[k].fp32[n]) -/// s1e := zero_lower_mantissa_bits_fp32(a1e) -/// s2e := zero_lower_mantissa_bits_fp32(a2e) -/// tmp.fp32[n] += s1e * s2e -/// ENDFOR -/// ENDFOR -/// -/// FOR n := 0 TO (elements_dest-1) -/// tmp.fp32[n] += srcdst.row[m].fp32[n] -/// ENDFOR -/// write_row_and_zero(srcdst, m, tmp, srcdst.colsb) -/// -/// ENDFOR -/// -/// zero_upper_rows(srcdst, srcdst.rows) -/// zero_tileconfig_start() -/// \endcode -#define _tile_tmmultf32ps(srcdst, a, b) \ - __builtin_ia32_ttmmultf32ps((srcdst), (a), (b)) - -// dst = m x n (srcdest), src1 = k x m, src2 = k x n -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE -_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2); -} - -/// Compute transpose and do Matrix Multiplication of src0 and src1, and then do -/// Matrix Plus with dst. All the calculation is base on float32 but with the -/// lower 13-bit set to 0. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_TF32_TRANSPOSE -static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -#endif // __x86_64__ -#endif // __AMX_TF32TRANSPOSEINTRIN_H diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h deleted file mode 100644 index b3fa37d..0000000 --- a/clang/lib/Headers/amxtransposeintrin.h +++ /dev/null @@ -1,248 +0,0 @@ -/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * ===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead." -#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_TRANSPOSEINTRIN_H -#define __AMX_TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS_TRANSPOSE \ - __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) - -#define _tile_2rpntlvwz0(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0(tdst, base, stride) -#define _tile_2rpntlvwz0t1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride) -#define _tile_2rpntlvwz1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1(tdst, base, stride) -#define _tile_2rpntlvwz1t1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) - -/// Transpose 32-bit elements from \a src and write the result to \a dst. -/// -/// \headerfile <immintrin.h> -/// -/// \code -/// void _tile_transposed(__tile dst, __tile src); -/// \endcode -/// -/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src -/// The source tile. Max size is 1024 Bytes. -/// -/// \code{.operation} -/// -/// FOR i := 0 TO (dst.rows-1) -/// tmp[511:0] := 0 -/// FOR j := 0 TO (dst.colsb/4-1) -/// tmp.dword[j] := src.row[j].dword[i] -/// ENDFOR -/// dst.row[i] := tmp -/// ENDFOR -/// -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src) - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - // Use __tile1024i_1024a* to escape the alignment check in - // clang/test/Headers/x86-intrinsics-headers-clean.cpp - __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0, - (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz0t1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0, - (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1t1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE -_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { - return __builtin_ia32_ttransposed_internal(m, n, src); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// Provides a hint to the implementation that the data will likely not be -/// reused in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely not be -/// reused in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely not be -/// reused in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Transpose 32-bit elements from src and write the result to dst. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src -/// The source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_transposed(__tile1024i *dst, __tile1024i src) { - dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile); -} - -#endif /* __x86_64__ */ -#endif /* __AMX_TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h index fe4277e..ee243ab 100644 --- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h +++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #ifndef _HLSL_COMPAT_OVERLOADS_H_ -#define _HLSl_COMPAT_OVERLOADS_H_ +#define _HLSL_COMPAT_OVERLOADS_H_ namespace hlsl { diff --git a/clang/lib/Headers/hvx_hexagon_protos.h b/clang/lib/Headers/hvx_hexagon_protos.h index fd120a5..19309a4 100644 --- a/clang/lib/Headers/hvx_hexagon_protos.h +++ b/clang/lib/Headers/hvx_hexagon_protos.h @@ -5605,6 +5605,399 @@ __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_f8)(Vu, Vv) #endif /* __HEXAGON_ARCH___ >= 79 */ +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vabs(Vu32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vabs_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vabs(Vu32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vabs_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vabs(Vu32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vqf32(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vabs_Vqf32(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vabs(Vu32.sf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vabs_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32=valign4(Vu32,Vv32,Rt8) + C Intrinsic Prototype: HVX_Vector Q6_V_valign4_VVR(HVX_Vector Vu, HVX_Vector + Vv, Word32 Rt) Instruction Type: CVI_VA Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_V_valign4_VVR(Vu, Vv, Rt) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_valign4)(Vu, Vv, Rt) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.bf=Vuu32.qf32 + C Intrinsic Prototype: HVX_Vector Q6_Vbf_equals_Wqf32(HVX_VectorPair Vuu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vbf_equals_Wqf32(Vuu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_bf_qf32)(Vuu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.f8=Vu32.qf16 + C Intrinsic Prototype: HVX_Vector Q6_V_equals_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_V_equals_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_f8_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.h=Vu32.hf:rnd + C Intrinsic Prototype: HVX_Vector Q6_Vh_equals_Vhf_rnd(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vh_equals_Vhf_rnd(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_h_hf_rnd)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vdd32.qf16=Vu32.f8 + C Intrinsic Prototype: HVX_VectorPair Q6_Wqf16_equals_V(HVX_Vector Vu) + Instruction Type: CVI_VP_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Wqf16_equals_V(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_f8)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=Vu32.hf + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_equals_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=Vu32.qf16 + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_equals_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=Vu32.qf32 + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vqf32(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_equals_Vqf32(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=Vu32.sf + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_equals_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qd4=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VhfVhf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VA Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eq_VhfVhf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf)(Vu, Vv)), -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4&=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVhfVhf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqand_QVhfVhf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_and)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4|=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVhfVhf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqor_QVhfVhf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_or)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4^=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVhfVhf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqxacc_QVhfVhf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_xor)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qd4=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VsfVsf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VA Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eq_VsfVsf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf)(Vu, Vv)), -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4&=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVsfVsf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqand_QVsfVsf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_and)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4|=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVsfVsf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqor_QVsfVsf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_or)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4^=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVsfVsf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqxacc_QVsfVsf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_xor)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf32(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vqf32(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.sf) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vneg(Vu32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vneg_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vneg(Vu32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vneg_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vneg(Vu32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vqf32(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vneg_Vqf32(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vneg(Vu32.sf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vneg_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vsub(Vu32.hf,Vv32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vsub_VhfVqf16(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VS Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vsub_VhfVqf16(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_mix)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vsub(Vu32.sf,Vv32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vsub_VsfVqf32(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VS Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vsub_VsfVqf32(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_sf_mix)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 81 */ + #endif /* __HVX__ */ #endif diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 35f012c..19064a4 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -475,24 +475,12 @@ _storebe_i64(void * __P, long long __D) { #include <amxfp8intrin.h> -#include <amxtransposeintrin.h> - #include <amxmovrsintrin.h> -#include <amxmovrstransposeintrin.h> - #include <amxavx512intrin.h> #include <amxtf32intrin.h> -#include <amxtf32transposeintrin.h> - -#include <amxbf16transposeintrin.h> - -#include <amxfp16transposeintrin.h> - -#include <amxcomplextransposeintrin.h> - #include <avx512vp2intersectintrin.h> #include <avx512vlvp2intersectintrin.h> diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index e4b158e..7e4a164 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -4248,6 +4248,13 @@ void Parser::ParseDeclarationSpecifiers( // type-specifier case tok::kw_short: + if (!getLangOpts().NativeInt16Type) { + Diag(Tok, diag::err_unknown_typename) << Tok.getName(); + DS.SetTypeSpecError(); + DS.SetRangeEnd(Tok.getLocation()); + ConsumeToken(); + goto DoneWithDeclSpec; + } isInvalid = DS.SetTypeSpecWidth(TypeSpecifierWidth::Short, Loc, PrevSpec, DiagID, Policy); break; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index f451787..ad2c2e4 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3542,9 +3542,7 @@ bool Sema::ValueIsRunOfOnes(CallExpr *TheCall, unsigned ArgNum) { bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx, unsigned FirstArg, FormatStringInfo *FSI) { - bool IsCXXMember = false; - if (const auto *MD = dyn_cast<CXXMethodDecl>(D)) - IsCXXMember = MD->isInstance(); + bool HasImplicitThisParam = hasImplicitObjectParameter(D); bool IsVariadic = false; if (const FunctionType *FnTy = D->getFunctionType()) IsVariadic = cast<FunctionProtoType>(FnTy)->isVariadic(); @@ -3553,11 +3551,12 @@ bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx, else if (const auto *OMD = dyn_cast<ObjCMethodDecl>(D)) IsVariadic = OMD->isVariadic(); - return getFormatStringInfo(FormatIdx, FirstArg, IsCXXMember, IsVariadic, FSI); + return getFormatStringInfo(FormatIdx, FirstArg, HasImplicitThisParam, + IsVariadic, FSI); } bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg, - bool IsCXXMember, bool IsVariadic, + bool HasImplicitThisParam, bool IsVariadic, FormatStringInfo *FSI) { if (FirstArg == 0) FSI->ArgPassingKind = FAPK_VAList; @@ -3571,7 +3570,7 @@ bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg, // The way the format attribute works in GCC, the implicit this argument // of member functions is counted. However, it doesn't appear in our own // lists, so decrement format_idx in that case. - if (IsCXXMember) { + if (HasImplicitThisParam) { if(FSI->FormatIdx == 0) return false; --FSI->FormatIdx; diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 0514d10..aa93507 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -10208,6 +10208,24 @@ void SemaCodeCompletion::CodeCompletePreprocessorDirective(bool InConditional) { Builder.AddPlaceholderChunk("message"); Results.AddResult(Builder.TakeString()); + if (getLangOpts().C23) { + // #embed "file" + Builder.AddTypedTextChunk("embed"); + Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace); + Builder.AddTextChunk("\""); + Builder.AddPlaceholderChunk("file"); + Builder.AddTextChunk("\""); + Results.AddResult(Builder.TakeString()); + + // #embed <file> + Builder.AddTypedTextChunk("embed"); + Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace); + Builder.AddTextChunk("<"); + Builder.AddPlaceholderChunk("file"); + Builder.AddTextChunk(">"); + Results.AddResult(Builder.TakeString()); + } + // Note: #ident and #sccs are such crazy anachronisms that we don't provide // completions for them. And __include_macros is a Clang-internal extension // that we don't want to encourage anyone to use. diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 964a2a7..a9e7b44 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3785,7 +3785,7 @@ static bool handleFormatAttrCommon(Sema &S, Decl *D, const ParsedAttr &AL, // In C++ the implicit 'this' function parameter also counts, and they are // counted from one. - bool HasImplicitThisParam = isInstanceMethod(D); + bool HasImplicitThisParam = hasImplicitObjectParameter(D); Info->NumArgs = getFunctionOrMethodNumParams(D) + HasImplicitThisParam; Info->Identifier = AL.getArgAsIdent(0)->getIdentifierInfo(); @@ -3926,7 +3926,7 @@ static void handleCallbackAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - bool HasImplicitThisParam = isInstanceMethod(D); + bool HasImplicitThisParam = hasImplicitObjectParameter(D); int32_t NumArgs = getFunctionOrMethodNumParams(D); FunctionDecl *FD = D->getAsFunction(); @@ -4110,7 +4110,7 @@ static void handleLifetimeCaptureByAttr(Sema &S, Decl *D, } void Sema::LazyProcessLifetimeCaptureByParams(FunctionDecl *FD) { - bool HasImplicitThisParam = isInstanceMethod(FD); + bool HasImplicitThisParam = hasImplicitObjectParameter(FD); SmallVector<LifetimeCaptureByAttr *, 1> Attrs; for (ParmVarDecl *PVD : FD->parameters()) if (auto *A = PVD->getAttr<LifetimeCaptureByAttr>()) diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 850bcb1..2f61bdd 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -489,14 +489,6 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tileloaddrst164: case X86::BI__builtin_ia32_tilestored64: case X86::BI__builtin_ia32_tilezero: - case X86::BI__builtin_ia32_t2rpntlvwz0: - case X86::BI__builtin_ia32_t2rpntlvwz0t1: - case X86::BI__builtin_ia32_t2rpntlvwz1: - case X86::BI__builtin_ia32_t2rpntlvwz1t1: - case X86::BI__builtin_ia32_t2rpntlvwz0rst1: - case X86::BI__builtin_ia32_t2rpntlvwz1rs: - case X86::BI__builtin_ia32_t2rpntlvwz1rst1: - case X86::BI__builtin_ia32_t2rpntlvwz0rs: case X86::BI__builtin_ia32_tcvtrowps2bf16h: case X86::BI__builtin_ia32_tcvtrowps2bf16l: case X86::BI__builtin_ia32_tcvtrowps2phh: @@ -516,17 +508,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tdpbhf8ps: case X86::BI__builtin_ia32_tdphbf8ps: case X86::BI__builtin_ia32_tdphf8ps: - case X86::BI__builtin_ia32_ttdpbf16ps: - case X86::BI__builtin_ia32_ttdpfp16ps: - case X86::BI__builtin_ia32_ttcmmimfp16ps: - case X86::BI__builtin_ia32_ttcmmrlfp16ps: - case X86::BI__builtin_ia32_tconjtcmmimfp16ps: case X86::BI__builtin_ia32_tmmultf32ps: - case X86::BI__builtin_ia32_ttmmultf32ps: return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2}); - case X86::BI__builtin_ia32_ttransposed: - case X86::BI__builtin_ia32_tconjtfp16: - return CheckBuiltinTileArgumentsRange(TheCall, {0, 1}); } } static bool isX86_32Builtin(unsigned BuiltinID) { diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp index 42f52d0..eebecdb 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp @@ -350,7 +350,7 @@ void sanitizeDiagOpts(DiagnosticOptions &DiagOpts) { // See `test/ClangScanDeps/diagnostic-pragmas.c` for an example. llvm::erase_if(DiagOpts.Warnings, [](StringRef Warning) { return llvm::StringSwitch<bool>(Warning) - .Cases("pch-vfs-diff", "error=pch-vfs-diff", false) + .Cases({"pch-vfs-diff", "error=pch-vfs-diff"}, false) .StartsWith("no-error=", false) .Default(true); }); |
