diff options
Diffstat (limited to 'clang/lib')
48 files changed, 917 insertions, 1385 deletions
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index b3ab82d..8b57b96 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3411,7 +3411,7 @@ static bool interp__builtin_x86_byteshift( static bool interp__builtin_ia32_shuffle_generic( InterpState &S, CodePtr OpPC, const CallExpr *Call, - llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)> + llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)> GetSourceIndex) { assert(Call->getNumArgs() == 3); @@ -3428,8 +3428,19 @@ static bool interp__builtin_ia32_shuffle_generic( for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) { auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); - const Pointer &Src = (SrcVecIdx == 0) ? A : B; - TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); }); + + if (SrcIdx < 0) { + // Zero out this element + if (ElemT == PT_Float) { + Dst.elem<Floating>(DstIdx) = Floating( + S.getASTContext().getFloatTypeSemantics(VecT->getElementType())); + } else { + INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem<T>(DstIdx) = T::from(0); }); + } + } else { + const Pointer &Src = (SrcVecIdx == 0) ? A : B; + TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); }); + } } Dst.initializeAllElements(); @@ -4382,7 +4393,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0; unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; - return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index}; + return std::pair<unsigned, int>{SrcIdx, + static_cast<int>(LaneOffset + Index)}; }); case X86::BI__builtin_ia32_shufpd: case X86::BI__builtin_ia32_shufpd256: @@ -4400,7 +4412,27 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0; unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; - return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index}; + return std::pair<unsigned, int>{SrcIdx, + static_cast<int>(LaneOffset + Index)}; + }); + case X86::BI__builtin_ia32_insertps128: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned Mask) { + // Bits [3:0]: zero mask - if bit is set, zero this element + if ((Mask & (1 << DstIdx)) != 0) { + return std::pair<unsigned, int>{0, -1}; + } + // Bits [7:6]: select element from source vector Y (0-3) + // Bits [5:4]: select destination position (0-3) + unsigned SrcElem = (Mask >> 6) & 0x3; + unsigned DstElem = (Mask >> 4) & 0x3; + if (DstIdx == DstElem) { + // Insert element from source vector (B) at this position + return std::pair<unsigned, int>{1, static_cast<int>(SrcElem)}; + } else { + // Copy from destination vector (A) + return std::pair<unsigned, int>{0, static_cast<int>(DstIdx)}; + } }); case X86::BI__builtin_ia32_pshufb128: case X86::BI__builtin_ia32_pshufb256: diff --git a/clang/lib/AST/CommentSema.cpp b/clang/lib/AST/CommentSema.cpp index 27ff5ab..d5ba240 100644 --- a/clang/lib/AST/CommentSema.cpp +++ b/clang/lib/AST/CommentSema.cpp @@ -225,7 +225,7 @@ static ParamCommandPassDirection getParamPassDirection(StringRef Arg) { return llvm::StringSwitch<ParamCommandPassDirection>(Arg) .Case("[in]", ParamCommandPassDirection::In) .Case("[out]", ParamCommandPassDirection::Out) - .Cases("[in,out]", "[out,in]", ParamCommandPassDirection::InOut) + .Cases({"[in,out]", "[out,in]"}, ParamCommandPassDirection::InOut) .Default(static_cast<ParamCommandPassDirection>(-1)); } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index d0404b9..97eeba8 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11621,7 +11621,7 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, static bool evalShuffleGeneric( EvalInfo &Info, const CallExpr *Call, APValue &Out, - llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)> + llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)> GetSourceIndex) { const auto *VT = Call->getType()->getAs<VectorType>(); @@ -11644,8 +11644,16 @@ static bool evalShuffleGeneric( for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) { auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); - const APValue &Src = (SrcVecIdx == 0) ? A : B; - ResultElements.push_back(Src.getVectorElt(SrcIdx)); + + if (SrcIdx < 0) { + // Zero out this element + QualType ElemTy = VT->getElementType(); + ResultElements.push_back( + APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy)))); + } else { + const APValue &Src = (SrcVecIdx == 0) ? A : B; + ResultElements.push_back(Src.getVectorElt(SrcIdx)); + } } Out = APValue(ResultElements.data(), ResultElements.size()); @@ -12438,7 +12446,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!evalShuffleGeneric( Info, E, R, [](unsigned DstIdx, - unsigned ShuffleMask) -> std::pair<unsigned, unsigned> { + unsigned ShuffleMask) -> std::pair<unsigned, int> { constexpr unsigned LaneBits = 128u; unsigned NumElemPerLane = LaneBits / 32; unsigned NumSelectableElems = NumElemPerLane / 2; @@ -12451,7 +12459,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1; unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; - return {SrcIdx, LaneOffset + Index}; + return {SrcIdx, static_cast<int>(LaneOffset + Index)}; })) return false; return Success(R, E); @@ -12463,7 +12471,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!evalShuffleGeneric( Info, E, R, [](unsigned DstIdx, - unsigned ShuffleMask) -> std::pair<unsigned, unsigned> { + unsigned ShuffleMask) -> std::pair<unsigned, int> { constexpr unsigned LaneBits = 128u; unsigned NumElemPerLane = LaneBits / 64; unsigned NumSelectableElems = NumElemPerLane / 2; @@ -12476,7 +12484,31 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1; unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; - return {SrcIdx, LaneOffset + Index}; + return {SrcIdx, static_cast<int>(LaneOffset + Index)}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_insertps128: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, + [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, int> { + // Bits [3:0]: zero mask - if bit is set, zero this element + if ((Mask & (1 << DstIdx)) != 0) { + return {0, -1}; + } + // Bits [7:6]: select element from source vector Y (0-3) + // Bits [5:4]: select destination position (0-3) + unsigned SrcElem = (Mask >> 6) & 0x3; + unsigned DstElem = (Mask >> 4) & 0x3; + if (DstIdx == DstElem) { + // Insert element from source vector (B) at this position + return {1, static_cast<int>(SrcElem)}; + } else { + // Copy from destination vector (A) + return {0, static_cast<int>(DstIdx)}; + } })) return false; return Success(R, E); diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp index 2673669..90b4ac1 100644 --- a/clang/lib/Basic/Targets/AVR.cpp +++ b/clang/lib/Basic/Targets/AVR.cpp @@ -30,13 +30,13 @@ struct LLVM_LIBRARY_VISIBILITY MCUInfo { // NOTE: This list has been synchronized with gcc-avr 5.4.0 and avr-libc 2.0.0. static MCUInfo AVRMcus[] = { - {"avr1", NULL, "1", 0}, + {"avr1", nullptr, "1", 0}, {"at90s1200", "__AVR_AT90S1200__", "1", 0}, {"attiny11", "__AVR_ATtiny11__", "1", 0}, {"attiny12", "__AVR_ATtiny12__", "1", 0}, {"attiny15", "__AVR_ATtiny15__", "1", 0}, {"attiny28", "__AVR_ATtiny28__", "1", 0}, - {"avr2", NULL, "2", 1}, + {"avr2", nullptr, "2", 1}, {"at90s2313", "__AVR_AT90S2313__", "2", 1}, {"at90s2323", "__AVR_AT90S2323__", "2", 1}, {"at90s2333", "__AVR_AT90S2333__", "2", 1}, @@ -50,7 +50,7 @@ static MCUInfo AVRMcus[] = { {"at90s8515", "__AVR_AT90S8515__", "2", 1}, {"at90c8534", "__AVR_AT90c8534__", "2", 1}, {"at90s8535", "__AVR_AT90S8535__", "2", 1}, - {"avr25", NULL, "25", 1}, + {"avr25", nullptr, "25", 1}, {"ata5272", "__AVR_ATA5272__", "25", 1}, {"ata6616c", "__AVR_ATA6616c__", "25", 1}, {"attiny13", "__AVR_ATtiny13__", "25", 1}, @@ -80,13 +80,13 @@ static MCUInfo AVRMcus[] = { {"attiny48", "__AVR_ATtiny48__", "25", 1}, {"attiny88", "__AVR_ATtiny88__", "25", 1}, {"attiny828", "__AVR_ATtiny828__", "25", 1}, - {"avr3", NULL, "3", 1}, + {"avr3", nullptr, "3", 1}, {"at43usb355", "__AVR_AT43USB355__", "3", 1}, {"at76c711", "__AVR_AT76C711__", "3", 1}, - {"avr31", NULL, "31", 1}, + {"avr31", nullptr, "31", 1}, {"atmega103", "__AVR_ATmega103__", "31", 1}, {"at43usb320", "__AVR_AT43USB320__", "31", 1}, - {"avr35", NULL, "35", 1}, + {"avr35", nullptr, "35", 1}, {"attiny167", "__AVR_ATtiny167__", "35", 1}, {"at90usb82", "__AVR_AT90USB82__", "35", 1}, {"at90usb162", "__AVR_AT90USB162__", "35", 1}, @@ -97,7 +97,7 @@ static MCUInfo AVRMcus[] = { {"atmega16u2", "__AVR_ATmega16U2__", "35", 1}, {"atmega32u2", "__AVR_ATmega32U2__", "35", 1}, {"attiny1634", "__AVR_ATtiny1634__", "35", 1}, - {"avr4", NULL, "4", 1}, + {"avr4", nullptr, "4", 1}, {"atmega8", "__AVR_ATmega8__", "4", 1}, {"ata6289", "__AVR_ATA6289__", "4", 1}, {"atmega8a", "__AVR_ATmega8A__", "4", 1}, @@ -123,7 +123,7 @@ static MCUInfo AVRMcus[] = { {"at90pwm3", "__AVR_AT90PWM3__", "4", 1}, {"at90pwm3b", "__AVR_AT90PWM3B__", "4", 1}, {"at90pwm81", "__AVR_AT90PWM81__", "4", 1}, - {"avr5", NULL, "5", 1}, + {"avr5", nullptr, "5", 1}, {"ata5702m322", "__AVR_ATA5702M322__", "5", 1}, {"ata5782", "__AVR_ATA5782__", "5", 1}, {"ata5790", "__AVR_ATA5790__", "5", 1}, @@ -230,7 +230,7 @@ static MCUInfo AVRMcus[] = { {"at90scr100", "__AVR_AT90SCR100__", "5", 1}, {"at94k", "__AVR_AT94K__", "5", 1}, {"m3000", "__AVR_AT000__", "5", 1}, - {"avr51", NULL, "51", 2}, + {"avr51", nullptr, "51", 2}, {"atmega128", "__AVR_ATmega128__", "51", 2}, {"atmega128a", "__AVR_ATmega128A__", "51", 2}, {"atmega1280", "__AVR_ATmega1280__", "51", 2}, @@ -243,12 +243,12 @@ static MCUInfo AVRMcus[] = { {"at90can128", "__AVR_AT90CAN128__", "51", 2}, {"at90usb1286", "__AVR_AT90USB1286__", "51", 2}, {"at90usb1287", "__AVR_AT90USB1287__", "51", 2}, - {"avr6", NULL, "6", 4}, + {"avr6", nullptr, "6", 4}, {"atmega2560", "__AVR_ATmega2560__", "6", 4}, {"atmega2561", "__AVR_ATmega2561__", "6", 4}, {"atmega256rfr2", "__AVR_ATmega256RFR2__", "6", 4}, {"atmega2564rfr2", "__AVR_ATmega2564RFR2__", "6", 4}, - {"avrxmega2", NULL, "102", 1}, + {"avrxmega2", nullptr, "102", 1}, {"atxmega16a4", "__AVR_ATxmega16A4__", "102", 1}, {"atxmega16a4u", "__AVR_ATxmega16A4U__", "102", 1}, {"atxmega16c4", "__AVR_ATxmega16C4__", "102", 1}, @@ -262,7 +262,7 @@ static MCUInfo AVRMcus[] = { {"atxmega32e5", "__AVR_ATxmega32E5__", "102", 1}, {"atxmega16e5", "__AVR_ATxmega16E5__", "102", 1}, {"atxmega8e5", "__AVR_ATxmega8E5__", "102", 1}, - {"avrxmega4", NULL, "104", 1}, + {"avrxmega4", nullptr, "104", 1}, {"atxmega64a3", "__AVR_ATxmega64A3__", "104", 1}, {"atxmega64a3u", "__AVR_ATxmega64A3U__", "104", 1}, {"atxmega64a4u", "__AVR_ATxmega64A4U__", "104", 1}, @@ -271,10 +271,10 @@ static MCUInfo AVRMcus[] = { {"atxmega64c3", "__AVR_ATxmega64C3__", "104", 1}, {"atxmega64d3", "__AVR_ATxmega64D3__", "104", 1}, {"atxmega64d4", "__AVR_ATxmega64D4__", "104", 1}, - {"avrxmega5", NULL, "105", 1}, + {"avrxmega5", nullptr, "105", 1}, {"atxmega64a1", "__AVR_ATxmega64A1__", "105", 1}, {"atxmega64a1u", "__AVR_ATxmega64A1U__", "105", 1}, - {"avrxmega6", NULL, "106", 6}, + {"avrxmega6", nullptr, "106", 6}, {"atxmega128a3", "__AVR_ATxmega128A3__", "106", 2}, {"atxmega128a3u", "__AVR_ATxmega128A3U__", "106", 2}, {"atxmega128b1", "__AVR_ATxmega128B1__", "106", 2}, @@ -294,11 +294,11 @@ static MCUInfo AVRMcus[] = { {"atxmega256d3", "__AVR_ATxmega256D3__", "106", 4}, {"atxmega384c3", "__AVR_ATxmega384C3__", "106", 6}, {"atxmega384d3", "__AVR_ATxmega384D3__", "106", 6}, - {"avrxmega7", NULL, "107", 2}, + {"avrxmega7", nullptr, "107", 2}, {"atxmega128a1", "__AVR_ATxmega128A1__", "107", 2}, {"atxmega128a1u", "__AVR_ATxmega128A1U__", "107", 2}, {"atxmega128a4u", "__AVR_ATxmega128A4U__", "107", 2}, - {"avrtiny", NULL, "100", 0}, + {"avrtiny", nullptr, "100", 0}, {"attiny4", "__AVR_ATtiny4__", "100", 0}, {"attiny5", "__AVR_ATtiny5__", "100", 0}, {"attiny9", "__AVR_ATtiny9__", "100", 0}, @@ -307,7 +307,7 @@ static MCUInfo AVRMcus[] = { {"attiny40", "__AVR_ATtiny40__", "100", 0}, {"attiny102", "__AVR_ATtiny102__", "100", 0}, {"attiny104", "__AVR_ATtiny104__", "100", 0}, - {"avrxmega3", NULL, "103", 1}, + {"avrxmega3", nullptr, "103", 1}, {"attiny202", "__AVR_ATtiny202__", "103", 1}, {"attiny402", "__AVR_ATtiny402__", "103", 1}, {"attiny204", "__AVR_ATtiny204__", "103", 1}, diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 9651c38..ec4e40b 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -171,7 +171,7 @@ ArrayRef<const char *> NVPTXTargetInfo::getGCCRegNames() const { bool NVPTXTargetInfo::hasFeature(StringRef Feature) const { return llvm::StringSwitch<bool>(Feature) - .Cases("ptx", "nvptx", true) + .Cases({"ptx", "nvptx"}, true) .Default(false); } diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 846b240..d4ada2a 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -125,9 +125,8 @@ public: .Cases({"power3", "pwr3"}, ArchDefinePpcgr) .Cases({"power4", "pwr4"}, ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power5", "pwr5", - ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | - ArchDefinePpcsq) + .Cases({"power5", "pwr5"}, ArchDefinePwr5 | ArchDefinePwr4 | + ArchDefinePpcgr | ArchDefinePpcsq) .Cases({"power5x", "pwr5x"}, ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) @@ -166,7 +165,7 @@ public: ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("8548", "e500", ArchDefineE500) + .Cases({"8548", "e500"}, ArchDefineE500) .Default(ArchDefineNone); } return CPUKnown; @@ -445,27 +444,17 @@ public: LongWidth = LongAlign = PointerWidth = PointerAlign = 64; IntMaxType = SignedLong; Int64Type = SignedLong; - std::string DataLayout; if (Triple.isOSAIX()) { // TODO: Set appropriate ABI for AIX platform. - DataLayout = "E-m:a-Fi64-i64:64-i128:128-n32:64"; LongDoubleWidth = 64; LongDoubleAlign = DoubleAlign = 32; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); - } else if ((Triple.getArch() == llvm::Triple::ppc64le)) { - DataLayout = "e-m:e-Fn32-i64:64-i128:128-n32:64"; + } else if ((Triple.getArch() == llvm::Triple::ppc64le) || + Triple.isPPC64ELFv2ABI()) { ABI = "elfv2"; } else { - DataLayout = "E-m:e"; - if (Triple.isPPC64ELFv2ABI()) { - ABI = "elfv2"; - DataLayout += "-Fn32"; - } else { - ABI = "elfv1"; - DataLayout += "-Fi64"; - } - DataLayout += "-i64:64-i128:128-n32:64"; + ABI = "elfv1"; } if (Triple.isOSFreeBSD() || Triple.isOSOpenBSD() || Triple.isMusl()) { @@ -473,14 +462,12 @@ public: LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } - if (Triple.isOSAIX() || Triple.isOSLinux()) - DataLayout += "-S128-v256:256:256-v512:512:512"; - resetDataLayout(DataLayout); - // Newer PPC64 instruction sets support atomics up to 16 bytes. MaxAtomicPromoteWidth = 128; // Baseline PPC64 supports inlining atomics up to 8 bytes. MaxAtomicInlineWidth = 64; + + calculateDataLayout(); } void setMaxAtomicWidth() override { @@ -495,10 +482,33 @@ public: return TargetInfo::CharPtrBuiltinVaList; } + void calculateDataLayout() { + std::string DataLayout; + + if (getTriple().isOSAIX()) { + DataLayout = "E-m:a-Fi64-i64:64-i128:128-n32:64"; + } else if ((getTriple().getArch() == llvm::Triple::ppc64le)) { + DataLayout = "e-m:e-Fn32-i64:64-i128:128-n32:64"; + } else { + DataLayout = "E-m:e"; + if (ABI == "elfv2") { + DataLayout += "-Fn32"; + } else { + DataLayout += "-Fi64"; + } + DataLayout += "-i64:64-i128:128-n32:64"; + } + + if (getTriple().isOSAIX() || getTriple().isOSLinux()) + DataLayout += "-S128-v256:256:256-v512:512:512"; + resetDataLayout(DataLayout); + } + // PPC64 Linux-specific ABI options. bool setABI(const std::string &Name) override { if (Name == "elfv1" || Name == "elfv2") { ABI = Name; + calculateDataLayout(); return true; } return false; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index e71f10c..7a90c89 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -396,8 +396,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasAMXFP8 = true; } else if (Feature == "+amx-movrs") { HasAMXMOVRS = true; - } else if (Feature == "+amx-transpose") { - HasAMXTRANSPOSE = true; } else if (Feature == "+amx-avx512") { HasAMXAVX512 = true; } else if (Feature == "+amx-tf32") { @@ -925,8 +923,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_FP8__"); if (HasAMXMOVRS) Builder.defineMacro("__AMX_MOVRS__"); - if (HasAMXTRANSPOSE) - Builder.defineMacro("__AMX_TRANSPOSE__"); if (HasAMXAVX512) Builder.defineMacro("__AMX_AVX512__"); if (HasAMXTF32) @@ -1068,7 +1064,6 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-movrs", true) .Case("amx-tf32", true) .Case("amx-tile", true) - .Case("amx-transpose", true) .Case("avx", true) .Case("avx10.1", true) .Case("avx10.2", true) @@ -1189,7 +1184,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-movrs", HasAMXMOVRS) .Case("amx-tf32", HasAMXTF32) .Case("amx-tile", HasAMXTILE) - .Case("amx-transpose", HasAMXTRANSPOSE) .Case("avx", SSELevel >= AVX) .Case("avx10.1", HasAVX10_1) .Case("avx10.2", HasAVX10_2) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index be3a473..e7da262 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -160,7 +160,6 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXCOMPLEX = false; bool HasAMXFP8 = false; bool HasAMXMOVRS = false; - bool HasAMXTRANSPOSE = false; bool HasAMXAVX512 = false; bool HasAMXTF32 = false; bool HasSERIALIZE = false; diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index 3c9c7ec..0198a9d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -771,14 +771,6 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI_WriteBarrier: case X86::BI_AddressOfReturnAddress: case X86::BI__stosb: - case X86::BI__builtin_ia32_t2rpntlvwz0_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: case X86::BI__ud2: case X86::BI__int2c: case X86::BI__readfsbyte: diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 71ff20a..5d5209b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -242,12 +242,19 @@ void CIRGenFunction::LexicalScope::cleanup() { } }; - if (returnBlock != nullptr) { - // Write out the return block, which loads the value from `__retval` and - // issues the `cir.return`. + // Cleanup are done right before codegen resumes a scope. This is where + // objects are destroyed. Process all return blocks. + // TODO(cir): Handle returning from a switch statement through a cleanup + // block. We can't simply jump to the cleanup block, because the cleanup block + // is not part of the case region. Either reemit all cleanups in the return + // block or wait for MLIR structured control flow to support early exits. + llvm::SmallVector<mlir::Block *> retBlocks; + for (mlir::Block *retBlock : localScope->getRetBlocks()) { mlir::OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToEnd(returnBlock); - (void)emitReturn(*returnLoc); + builder.setInsertionPointToEnd(retBlock); + retBlocks.push_back(retBlock); + mlir::Location retLoc = localScope->getRetLoc(retBlock); + emitReturn(retLoc); } auto insertCleanupAndLeave = [&](mlir::Block *insPt) { @@ -274,19 +281,22 @@ void CIRGenFunction::LexicalScope::cleanup() { if (localScope->depth == 0) { // Reached the end of the function. - if (returnBlock != nullptr) { - if (returnBlock->getUses().empty()) { - returnBlock->erase(); + // Special handling only for single return block case + if (localScope->getRetBlocks().size() == 1) { + mlir::Block *retBlock = localScope->getRetBlocks()[0]; + mlir::Location retLoc = localScope->getRetLoc(retBlock); + if (retBlock->getUses().empty()) { + retBlock->erase(); } else { // Thread return block via cleanup block. if (cleanupBlock) { - for (mlir::BlockOperand &blockUse : returnBlock->getUses()) { + for (mlir::BlockOperand &blockUse : retBlock->getUses()) { cir::BrOp brOp = mlir::cast<cir::BrOp>(blockUse.getOwner()); brOp.setSuccessor(cleanupBlock); } } - cir::BrOp::create(builder, *returnLoc, returnBlock); + cir::BrOp::create(builder, retLoc, retBlock); return; } } @@ -324,8 +334,10 @@ void CIRGenFunction::LexicalScope::cleanup() { bool entryBlock = builder.getInsertionBlock()->isEntryBlock(); if (!entryBlock && curBlock->empty()) { curBlock->erase(); - if (returnBlock != nullptr && returnBlock->getUses().empty()) - returnBlock->erase(); + for (mlir::Block *retBlock : retBlocks) { + if (retBlock->getUses().empty()) + retBlock->erase(); + } return; } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index c3fcd1a6..e5cecaa5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1103,44 +1103,69 @@ public: // --- private: - // `returnBlock`, `returnLoc`, and all the functions that deal with them - // will change and become more complicated when `switch` statements are - // upstreamed. `case` statements within the `switch` are in the same scope - // but have their own regions. Therefore the LexicalScope will need to - // keep track of multiple return blocks. - mlir::Block *returnBlock = nullptr; - std::optional<mlir::Location> returnLoc; - - // See the comment on `getOrCreateRetBlock`. + // On switches we need one return block per region, since cases don't + // have their own scopes but are distinct regions nonetheless. + + // TODO: This implementation should change once we have support for early + // exits in MLIR structured control flow (llvm-project#161575) + llvm::SmallVector<mlir::Block *> retBlocks; + llvm::DenseMap<mlir::Block *, mlir::Location> retLocs; + llvm::DenseMap<cir::CaseOp, unsigned> retBlockInCaseIndex; + std::optional<unsigned> normalRetBlockIndex; + + // There's usually only one ret block per scope, but this needs to be + // get or create because of potential unreachable return statements, note + // that for those, all source location maps to the first one found. mlir::Block *createRetBlock(CIRGenFunction &cgf, mlir::Location loc) { - assert(returnBlock == nullptr && "only one return block per scope"); - // Create the cleanup block but don't hook it up just yet. + assert((isa_and_nonnull<cir::CaseOp>( + cgf.builder.getBlock()->getParentOp()) || + retBlocks.size() == 0) && + "only switches can hold more than one ret block"); + + // Create the return block but don't hook it up just yet. mlir::OpBuilder::InsertionGuard guard(cgf.builder); - returnBlock = - cgf.builder.createBlock(cgf.builder.getBlock()->getParent()); - updateRetLoc(returnBlock, loc); - return returnBlock; + auto *b = cgf.builder.createBlock(cgf.builder.getBlock()->getParent()); + retBlocks.push_back(b); + updateRetLoc(b, loc); + return b; } cir::ReturnOp emitReturn(mlir::Location loc); void emitImplicitReturn(); public: - mlir::Block *getRetBlock() { return returnBlock; } - mlir::Location getRetLoc(mlir::Block *b) { return *returnLoc; } - void updateRetLoc(mlir::Block *b, mlir::Location loc) { returnLoc = loc; } - - // Create the return block for this scope, or return the existing one. - // This get-or-create logic is necessary to handle multiple return - // statements within the same scope, which can happen if some of them are - // dead code or if there is a `goto` into the middle of the scope. + llvm::ArrayRef<mlir::Block *> getRetBlocks() { return retBlocks; } + mlir::Location getRetLoc(mlir::Block *b) { return retLocs.at(b); } + void updateRetLoc(mlir::Block *b, mlir::Location loc) { + retLocs.insert_or_assign(b, loc); + } + mlir::Block *getOrCreateRetBlock(CIRGenFunction &cgf, mlir::Location loc) { - if (returnBlock == nullptr) { - returnBlock = createRetBlock(cgf, loc); - return returnBlock; + // Check if we're inside a case region + if (auto caseOp = mlir::dyn_cast_if_present<cir::CaseOp>( + cgf.builder.getBlock()->getParentOp())) { + auto iter = retBlockInCaseIndex.find(caseOp); + if (iter != retBlockInCaseIndex.end()) { + // Reuse existing return block + mlir::Block *ret = retBlocks[iter->second]; + updateRetLoc(ret, loc); + return ret; + } + // Create new return block + mlir::Block *ret = createRetBlock(cgf, loc); + retBlockInCaseIndex[caseOp] = retBlocks.size() - 1; + return ret; } - updateRetLoc(returnBlock, loc); - return returnBlock; + + if (normalRetBlockIndex) { + mlir::Block *ret = retBlocks[*normalRetBlockIndex]; + updateRetLoc(ret, loc); + return ret; + } + + mlir::Block *ret = createRetBlock(cgf, loc); + normalRetBlockIndex = retBlocks.size() - 1; + return ret; } mlir::Block *getEntryBlock() { return entryBlock; } diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index 5010137..527dfd2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -126,7 +126,7 @@ class OpenACCClauseCIREmitter final .CaseLower("default", mlir::acc::DeviceType::Default) .CaseLower("host", mlir::acc::DeviceType::Host) .CaseLower("multicore", mlir::acc::DeviceType::Multicore) - .CasesLower("nvidia", "acc_device_nvidia", + .CasesLower({"nvidia", "acc_device_nvidia"}, mlir::acc::DeviceType::Nvidia) .CaseLower("radeon", mlir::acc::DeviceType::Radeon); } diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 3c31314..b967a26 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -313,7 +313,7 @@ getCodeModel(const CodeGenOptions &CodeGenOpts) { .Case("kernel", llvm::CodeModel::Kernel) .Case("medium", llvm::CodeModel::Medium) .Case("large", llvm::CodeModel::Large) - .Cases("default", "", ~1u) + .Cases({"default", ""}, ~1u) .Default(~0u); assert(CodeModel != ~0u && "invalid code model!"); if (CodeModel == ~1u) diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp index 6da65b6..8a1cab3 100644 --- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp @@ -375,28 +375,28 @@ static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS, CGF.EmitScalarExpr(E->getArg(1))}); } -static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID, - const CallExpr *E, CodeGenFunction &CGF) { +static bool EnsureNativeHalfSupport(unsigned BuiltinID, const CallExpr *E, + CodeGenFunction &CGF) { auto &C = CGF.CGM.getContext(); - if (!(C.getLangOpts().NativeHalfType || - !C.getTargetInfo().useFP16ConversionIntrinsics())) { + if (!C.getLangOpts().NativeHalfType && + C.getTargetInfo().useFP16ConversionIntrinsics()) { CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getQuotedName(BuiltinID) + " requires native half type support."); - return nullptr; + return false; } + return true; +} - if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2) - return MakeLdg(CGF, E); - - if (IntrinsicID == Intrinsic::nvvm_ldu_global_f) - return MakeLdu(IntrinsicID, CGF, E); +static Value *MakeHalfType(Function *Intrinsic, unsigned BuiltinID, + const CallExpr *E, CodeGenFunction &CGF) { + if (!EnsureNativeHalfSupport(BuiltinID, E, CGF)) + return nullptr; SmallVector<Value *, 16> Args; - auto *F = CGF.CGM.getIntrinsic(IntrinsicID); - auto *FTy = F->getFunctionType(); + auto *FTy = Intrinsic->getFunctionType(); unsigned ICEArguments = 0; ASTContext::GetBuiltinTypeError Error; - C.GetBuiltinType(BuiltinID, Error, &ICEArguments); + CGF.CGM.getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); assert(Error == ASTContext::GE_None && "Should not codegen an error"); for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) { assert((ICEArguments & (1 << i)) == 0); @@ -407,8 +407,14 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID, Args.push_back(ArgValue); } - return CGF.Builder.CreateCall(F, Args); + return CGF.Builder.CreateCall(Intrinsic, Args); } + +static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID, + const CallExpr *E, CodeGenFunction &CGF) { + return MakeHalfType(CGF.CGM.getIntrinsic(IntrinsicID), BuiltinID, E, CGF); +} + } // namespace Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, @@ -913,9 +919,14 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, } // The following builtins require half type support case NVPTX::BI__nvvm_ex2_approx_f16: - return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this); + return MakeHalfType( + CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx, Builder.getHalfTy()), + BuiltinID, E, *this); case NVPTX::BI__nvvm_ex2_approx_f16x2: - return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this); + return MakeHalfType( + CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx, + FixedVectorType::get(Builder.getHalfTy(), 2)), + BuiltinID, E, *this); case NVPTX::BI__nvvm_ff2f16x2_rn: return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this); case NVPTX::BI__nvvm_ff2f16x2_rn_relu: @@ -1049,12 +1060,22 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__nvvm_fabs_d: return Builder.CreateUnaryIntrinsic(Intrinsic::fabs, EmitScalarExpr(E->getArg(0))); + case NVPTX::BI__nvvm_ex2_approx_d: + case NVPTX::BI__nvvm_ex2_approx_f: + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx, + EmitScalarExpr(E->getArg(0))); + case NVPTX::BI__nvvm_ex2_approx_ftz_f: + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx_ftz, + EmitScalarExpr(E->getArg(0))); case NVPTX::BI__nvvm_ldg_h: case NVPTX::BI__nvvm_ldg_h2: - return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this); + return EnsureNativeHalfSupport(BuiltinID, E, *this) ? MakeLdg(*this, E) + : nullptr; case NVPTX::BI__nvvm_ldu_h: case NVPTX::BI__nvvm_ldu_h2: - return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this); + return EnsureNativeHalfSupport(BuiltinID, E, *this) + ? MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E) + : nullptr; case NVPTX::BI__nvvm_cp_async_ca_shared_global_4: return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4, Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E, diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index b924407..2381b2e 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -2931,74 +2931,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // instruction, but it will create a memset that won't be optimized away. return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true); } - // Corresponding to intrisics which will return 2 tiles (tile0_tile1). - case X86::BI__builtin_ia32_t2rpntlvwz0_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: { - Intrinsic::ID IID; - switch (BuiltinID) { - default: - llvm_unreachable("Unsupported intrinsic!"); - case X86::BI__builtin_ia32_t2rpntlvwz0_internal: - IID = Intrinsic::x86_t2rpntlvwz0_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: - IID = Intrinsic::x86_t2rpntlvwz0rs_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: - IID = Intrinsic::x86_t2rpntlvwz0t1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: - IID = Intrinsic::x86_t2rpntlvwz0rst1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - IID = Intrinsic::x86_t2rpntlvwz1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: - IID = Intrinsic::x86_t2rpntlvwz1rs_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: - IID = Intrinsic::x86_t2rpntlvwz1t1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: - IID = Intrinsic::x86_t2rpntlvwz1rst1_internal; - break; - } - - // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) - Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), - {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]}); - - auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>(); - assert(PtrTy && "arg3 must be of pointer type"); - QualType PtreeTy = PtrTy->getPointeeType(); - llvm::Type *TyPtee = ConvertType(PtreeTy); - - // Bitcast amx type (x86_amx) to vector type (256 x i32) - // Then store tile0 into DstPtr0 - Value *T0 = Builder.CreateExtractValue(Call, 0); - Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, - {TyPtee}, {T0}); - Builder.CreateDefaultAlignedStore(VecT0, Ops[3]); - - // Then store tile1 into DstPtr1 - Value *T1 = Builder.CreateExtractValue(Call, 1); - Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, - {TyPtee}, {T1}); - Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]); - - // Note: Here we escape directly use x86_tilestored64_internal to store - // the results due to it can't make sure the Mem written scope. This may - // cause shapes reloads after first amx intrinsic, which current amx reg- - // ister allocation has no ability to handle it. - - return Store; - } case X86::BI__ud2: // llvm.trap makes a ud2a instruction on x86. return EmitTrapCall(Intrinsic::trap); diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index 15d0b35..abd049a 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -260,7 +260,8 @@ CommonSPIRTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM, LangAS AS = QT->getUnqualifiedDesugaredType()->isNullPtrType() ? LangAS::Default : QT->getPointeeType().getAddressSpace(); - if (AS == LangAS::Default || AS == LangAS::opencl_generic) + if (AS == LangAS::Default || AS == LangAS::opencl_generic || + AS == LangAS::opencl_constant) return llvm::ConstantPointerNull::get(PT); auto &Ctx = CGM.getContext(); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 71c5280..51618d1 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -2540,10 +2540,14 @@ bool Driver::HandleImmediateArgs(Compilation &C) { } if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) { - if (std::optional<std::string> RuntimePath = TC.getRuntimePath()) - llvm::outs() << *RuntimePath << '\n'; - else - llvm::outs() << TC.getCompilerRTPath() << '\n'; + for (auto RuntimePath : + {TC.getRuntimePath(), std::make_optional(TC.getCompilerRTPath())}) { + if (RuntimePath && getVFS().exists(*RuntimePath)) { + llvm::outs() << *RuntimePath << '\n'; + return false; + } + } + llvm::outs() << "(runtime dir is not present)" << '\n'; return false; } diff --git a/clang/lib/Driver/ToolChains/Arch/M68k.cpp b/clang/lib/Driver/ToolChains/Arch/M68k.cpp index 1037c0e..708ec84 100644 --- a/clang/lib/Driver/ToolChains/Arch/M68k.cpp +++ b/clang/lib/Driver/ToolChains/Arch/M68k.cpp @@ -36,12 +36,12 @@ std::string m68k::getM68kTargetCPU(const ArgList &Args) { return "generic"; return llvm::StringSwitch<std::string>(CPUName) - .Cases("m68000", "68000", "M68000") - .Cases("m68010", "68010", "M68010") - .Cases("m68020", "68020", "M68020") - .Cases("m68030", "68030", "M68030") - .Cases("m68040", "68040", "M68040") - .Cases("m68060", "68060", "M68060") + .Cases({"m68000", "68000"}, "M68000") + .Cases({"m68010", "68010"}, "M68010") + .Cases({"m68020", "68020"}, "M68020") + .Cases({"m68030", "68030"}, "M68030") + .Cases({"m68040", "68040"}, "M68040") + .Cases({"m68060", "68060"}, "M68060") .Default(CPUName.str()); } // FIXME: Throw error when multiple sub-architecture flag exist diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp index 6a6a4ee..8d7b85d 100644 --- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp @@ -117,7 +117,7 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple, // Deduce CPU name from ABI name. CPUName = llvm::StringSwitch<const char *>(ABIName) .Case("o32", DefMips32CPU) - .Cases("n32", "n64", DefMips64CPU) + .Cases({"n32", "n64"}, DefMips64CPU) .Default(""); } @@ -467,7 +467,7 @@ bool mips::isNaN2008(const Driver &D, const ArgList &Args, // NaN2008 is the default for MIPS32r6/MIPS64r6. return llvm::StringSwitch<bool>(getCPUName(D, Args, Triple)) - .Cases("mips32r6", "mips64r6", true) + .Cases({"mips32r6", "mips64r6"}, true) .Default(false); } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 4e8f63e..d3ab6f1 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3708,6 +3708,7 @@ static void RenderHLSLOptions(const ArgList &Args, ArgStringList &CmdArgs, options::OPT_emit_obj, options::OPT_disable_llvm_passes, options::OPT_fnative_half_type, + options::OPT_fnative_int16_type, options::OPT_hlsl_entrypoint, options::OPT_fdx_rootsignature_define, options::OPT_fdx_rootsignature_version, diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index cc5bcd1..2fb7652 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1035,12 +1035,12 @@ static const char *ArmMachOArchName(StringRef Arch) { .Case("xscale", "xscale") .Case("armv4t", "armv4t") .Case("armv7", "armv7") - .Cases("armv7a", "armv7-a", "armv7") - .Cases("armv7r", "armv7-r", "armv7") - .Cases("armv7em", "armv7e-m", "armv7em") - .Cases("armv7k", "armv7-k", "armv7k") - .Cases("armv7m", "armv7-m", "armv7m") - .Cases("armv7s", "armv7-s", "armv7s") + .Cases({"armv7a", "armv7-a"}, "armv7") + .Cases({"armv7r", "armv7-r"}, "armv7") + .Cases({"armv7em", "armv7e-m"}, "armv7em") + .Cases({"armv7k", "armv7-k"}, "armv7k") + .Cases({"armv7m", "armv7-m"}, "armv7m") + .Cases({"armv7s", "armv7-s"}, "armv7s") .Default(nullptr); } diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp index 20a320e..8d3fba7 100644 --- a/clang/lib/Driver/ToolChains/HLSL.cpp +++ b/clang/lib/Driver/ToolChains/HLSL.cpp @@ -498,6 +498,15 @@ HLSLToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch, continue; } + if (A->getOption().getID() == options::OPT_enable_16bit_types) { + // Translate -enable-16bit-types into -fnative-half-type and + // -fnative-int16-type + DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_half_type)); + DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_int16_type)); + A->claim(); + continue; + } + DAL->append(A); } diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index 02aa598..64c7d1c 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -346,7 +346,7 @@ SanitizerMask Solaris::getSupportedSanitizers() const { const char *Solaris::getDefaultLinker() const { // FIXME: Only handle Solaris ld and GNU ld here. return llvm::StringSwitch<const char *>(getDriver().getPreferredLinker()) - .Cases("bfd", "gld", "/usr/gnu/bin/ld") + .Cases({"bfd", "gld"}, "/usr/gnu/bin/ld") .Default("/usr/bin/ld"); } diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index e5abf83..9ab024a 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -356,9 +356,11 @@ bool ContinuationIndenter::canBreak(const LineState &State) { return CurrentState.BreakBeforeClosingBrace; } - // Allow breaking before the right parens with block indentation if there was - // a break after the left parens, which is tracked by BreakBeforeClosingParen. - if (Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent && + // Check need to break before the right parens if there was a break after + // the left parens, which is tracked by BreakBeforeClosingParen. + if ((Style.BreakBeforeCloseBracketFunction || + Style.BreakBeforeCloseBracketIf || Style.BreakBeforeCloseBracketLoop || + Style.BreakBeforeCloseBracketSwitch) && Current.is(tok::r_paren)) { return CurrentState.BreakBeforeClosingParen; } @@ -837,32 +839,38 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, return Tok.is(tok::l_brace) && Tok.isNot(BK_Block) && Style.Cpp11BracedListStyle != FormatStyle::BLS_Block; }; - if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) && - !IsStartOfBracedList()) { + if (IsStartOfBracedList()) + return Style.BreakAfterOpenBracketBracedList; + if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square)) return false; - } if (!Tok.Previous) return true; if (Tok.Previous->isIf()) - return Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak; - return Tok.Previous->isNoneOf(TT_CastRParen, tok::kw_for, tok::kw_while, - tok::kw_switch) && - !(Style.isJavaScript() && Tok.Previous->is(Keywords.kw_await)); + return Style.BreakAfterOpenBracketIf; + if (Tok.Previous->isLoop(Style)) + return Style.BreakAfterOpenBracketLoop; + if (Tok.Previous->is(tok::kw_switch)) + return Style.BreakAfterOpenBracketSwitch; + if (Style.BreakAfterOpenBracketFunction) { + return !Tok.Previous->is(TT_CastRParen) && + !(Style.isJavaScript() && Tok.is(Keywords.kw_await)); + } + return false; }; auto IsFunctionCallParen = [](const FormatToken &Tok) { return Tok.is(tok::l_paren) && Tok.ParameterCount > 0 && Tok.Previous && Tok.Previous->is(tok::identifier); }; - auto IsInTemplateString = [this](const FormatToken &Tok) { + auto IsInTemplateString = [this](const FormatToken &Tok, bool NestBlocks) { if (!Style.isJavaScript()) return false; for (const auto *Prev = &Tok; Prev; Prev = Prev->Previous) { if (Prev->is(TT_TemplateString) && Prev->opensScope()) return true; - if (Prev->opensScope() || - (Prev->is(TT_TemplateString) && Prev->closesScope())) { - break; - } + if (Prev->opensScope() && !NestBlocks) + return false; + if (Prev->is(TT_TemplateString) && Prev->closesScope()) + return false; } return false; }; @@ -884,21 +892,25 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, Tok.isOneOf(tok::ellipsis, Keywords.kw_await))) { return true; } - if (const auto *Previous = Tok.Previous; - !Previous || (Previous->isNoneOf(TT_FunctionDeclarationLParen, - TT_LambdaDefinitionLParen) && - !IsFunctionCallParen(*Previous))) { + const auto *Previous = TokAfterLParen.Previous; + assert(Previous); // IsOpeningBracket(Previous) + if (Previous->Previous && + (Previous->Previous->isIf() || Previous->Previous->isLoop(Style) || + Previous->Previous->is(tok::kw_switch))) { + return false; + } + if (Previous->isNoneOf(TT_FunctionDeclarationLParen, + TT_LambdaDefinitionLParen) && + !IsFunctionCallParen(*Previous)) { return true; } - if (IsOpeningBracket(Tok) || IsInTemplateString(Tok)) + if (IsOpeningBracket(Tok) || IsInTemplateString(Tok, true)) return true; const auto *Next = Tok.Next; return !Next || Next->isMemberAccess() || Next->is(TT_FunctionDeclarationLParen) || IsFunctionCallParen(*Next); }; - if ((Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak || - Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) && - IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) && + if (IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) && // Don't do this for simple (no expressions) one-argument function calls // as that feels like needlessly wasting whitespace, e.g.: // @@ -920,7 +932,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // Note: This doesn't apply to macro expansion lines, which are MACRO( , , ) // with args as children of the '(' and ',' tokens. It does not make sense to // align the commas with the opening paren. - if (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign && + if (Style.AlignAfterOpenBracket && !CurrentState.IsCSharpGenericTypeConstraint && Previous.opensScope() && Previous.isNoneOf(TT_ObjCMethodExpr, TT_RequiresClause, TT_TableGenDAGArgOpener, @@ -933,7 +945,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, Previous.Previous->isNoneOf(tok::identifier, tok::l_paren, BK_BracedInit))) || Previous.is(TT_VerilogMultiLineListLParen)) && - !IsInTemplateString(Current)) { + !IsInTemplateString(Current, false)) { CurrentState.Indent = State.Column + Spaces; CurrentState.IsAligned = true; } @@ -1271,8 +1283,20 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State, } if (PreviousNonComment && PreviousNonComment->is(tok::l_paren)) { - CurrentState.BreakBeforeClosingParen = - Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent; + if (auto Previous = PreviousNonComment->Previous) { + if (Previous->isIf()) { + CurrentState.BreakBeforeClosingParen = Style.BreakBeforeCloseBracketIf; + } else if (Previous->isLoop(Style)) { + CurrentState.BreakBeforeClosingParen = + Style.BreakBeforeCloseBracketLoop; + } else if (Previous->is(tok::kw_switch)) { + CurrentState.BreakBeforeClosingParen = + Style.BreakBeforeCloseBracketSwitch; + } else { + CurrentState.BreakBeforeClosingParen = + Style.BreakBeforeCloseBracketFunction; + } + } } if (PreviousNonComment && PreviousNonComment->is(TT_TemplateOpener)) @@ -1416,13 +1440,17 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { State.Stack.size() > 1) { return State.Stack[State.Stack.size() - 2].LastSpace; } - if (Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent && - (Current.is(tok::r_paren) || - (Current.is(tok::r_brace) && Current.MatchingParen && - Current.MatchingParen->is(BK_BracedInit))) && + if (Style.BreakBeforeCloseBracketBracedList && Current.is(tok::r_brace) && + Current.MatchingParen && Current.MatchingParen->is(BK_BracedInit) && State.Stack.size() > 1) { return State.Stack[State.Stack.size() - 2].LastSpace; } + if ((Style.BreakBeforeCloseBracketFunction || + Style.BreakBeforeCloseBracketIf || Style.BreakBeforeCloseBracketLoop || + Style.BreakBeforeCloseBracketSwitch) && + Current.is(tok::r_paren) && State.Stack.size() > 1) { + return State.Stack[State.Stack.size() - 2].LastSpace; + } if (Style.BreakBeforeTemplateCloser && Current.is(TT_TemplateCloser) && State.Stack.size() > 1) { return State.Stack[State.Stack.size() - 2].LastSpace; @@ -1844,8 +1872,8 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State, PrecedenceLevel < prec::Assignment) && (!Previous || Previous->isNot(tok::kw_return) || (!Style.isJava() && PrecedenceLevel > 0)) && - (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign || - PrecedenceLevel > prec::Comma || Current.NestingLevel == 0) && + (Style.AlignAfterOpenBracket || PrecedenceLevel > prec::Comma || + Current.NestingLevel == 0) && (!Style.isTableGen() || (Previous && Previous->isOneOf(TT_TableGenDAGArgListComma, TT_TableGenDAGArgListCommaToBreak)))) { @@ -1885,8 +1913,7 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State, if (PrecedenceLevel > prec::Unknown) NewParenState.LastSpace = std::max(NewParenState.LastSpace, State.Column); if (PrecedenceLevel != prec::Conditional && - Current.isNot(TT_UnaryOperator) && - Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign) { + Current.isNot(TT_UnaryOperator) && Style.AlignAfterOpenBracket) { NewParenState.StartOfFunctionCall = State.Column; } diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index edd126c..dd14fcd 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -32,6 +32,13 @@ using clang::format::FormatStyle; LLVM_YAML_IS_SEQUENCE_VECTOR(FormatStyle::RawStringFormat) +enum BracketAlignmentStyle : int8_t { + BAS_Align, + BAS_DontAlign, + BAS_AlwaysBreak, + BAS_BlockIndent +}; + namespace llvm { namespace yaml { template <> @@ -204,16 +211,16 @@ template <> struct MappingTraits<FormatStyle::BraceWrappingFlags> { } }; -template <> struct ScalarEnumerationTraits<FormatStyle::BracketAlignmentStyle> { - static void enumeration(IO &IO, FormatStyle::BracketAlignmentStyle &Value) { - IO.enumCase(Value, "Align", FormatStyle::BAS_Align); - IO.enumCase(Value, "DontAlign", FormatStyle::BAS_DontAlign); - IO.enumCase(Value, "AlwaysBreak", FormatStyle::BAS_AlwaysBreak); - IO.enumCase(Value, "BlockIndent", FormatStyle::BAS_BlockIndent); +template <> struct ScalarEnumerationTraits<BracketAlignmentStyle> { + static void enumeration(IO &IO, BracketAlignmentStyle &Value) { + IO.enumCase(Value, "Align", BAS_Align); + IO.enumCase(Value, "DontAlign", BAS_DontAlign); // For backward compatibility. - IO.enumCase(Value, "true", FormatStyle::BAS_Align); - IO.enumCase(Value, "false", FormatStyle::BAS_DontAlign); + IO.enumCase(Value, "true", BAS_Align); + IO.enumCase(Value, "false", BAS_DontAlign); + IO.enumCase(Value, "AlwaysBreak", BAS_AlwaysBreak); + IO.enumCase(Value, "BlockIndent", BAS_BlockIndent); } }; @@ -979,6 +986,54 @@ template <> struct MappingTraits<FormatStyle> { bool SpacesInCStyleCastParentheses = false; bool SpacesInParentheses = false; + if (IO.outputting()) { + IO.mapOptional("AlignAfterOpenBracket", Style.AlignAfterOpenBracket); + } else { + // For backward compatibility. + BracketAlignmentStyle LocalBAS = BAS_Align; + if (IsGoogleOrChromium) { + FormatStyle::LanguageKind Language = Style.Language; + if (Language == FormatStyle::LK_None) + Language = ((FormatStyle *)IO.getContext())->Language; + if (Language == FormatStyle::LK_JavaScript) + LocalBAS = BAS_AlwaysBreak; + else if (Language == FormatStyle::LK_Java) + LocalBAS = BAS_DontAlign; + } else if (BasedOnStyle.equals_insensitive("webkit")) { + LocalBAS = BAS_DontAlign; + } + IO.mapOptional("AlignAfterOpenBracket", LocalBAS); + Style.BreakAfterOpenBracketBracedList = false; + Style.BreakAfterOpenBracketFunction = false; + Style.BreakAfterOpenBracketIf = false; + Style.BreakAfterOpenBracketLoop = false; + Style.BreakAfterOpenBracketSwitch = false; + Style.BreakBeforeCloseBracketBracedList = false; + Style.BreakBeforeCloseBracketFunction = false; + Style.BreakBeforeCloseBracketIf = false; + Style.BreakBeforeCloseBracketLoop = false; + Style.BreakBeforeCloseBracketSwitch = false; + + switch (LocalBAS) { + case BAS_DontAlign: + Style.AlignAfterOpenBracket = false; + break; + case BAS_BlockIndent: + Style.BreakBeforeCloseBracketBracedList = true; + Style.BreakBeforeCloseBracketFunction = true; + Style.BreakBeforeCloseBracketIf = true; + [[fallthrough]]; + case BAS_AlwaysBreak: + Style.BreakAfterOpenBracketBracedList = true; + Style.BreakAfterOpenBracketFunction = true; + Style.BreakAfterOpenBracketIf = true; + [[fallthrough]]; + case BAS_Align: + Style.AlignAfterOpenBracket = true; + break; + } + } + // For backward compatibility. if (!IO.outputting()) { IO.mapOptional("AlignEscapedNewlinesLeft", Style.AlignEscapedNewlines); @@ -1014,7 +1069,6 @@ template <> struct MappingTraits<FormatStyle> { } IO.mapOptional("AccessModifierOffset", Style.AccessModifierOffset); - IO.mapOptional("AlignAfterOpenBracket", Style.AlignAfterOpenBracket); IO.mapOptional("AlignArrayOfStructures", Style.AlignArrayOfStructures); IO.mapOptional("AlignConsecutiveAssignments", Style.AlignConsecutiveAssignments); @@ -1079,10 +1133,29 @@ template <> struct MappingTraits<FormatStyle> { IO.mapOptional("BreakAfterAttributes", Style.BreakAfterAttributes); IO.mapOptional("BreakAfterJavaFieldAnnotations", Style.BreakAfterJavaFieldAnnotations); + IO.mapOptional("BreakAfterOpenBracketBracedList", + Style.BreakAfterOpenBracketBracedList); + IO.mapOptional("BreakAfterOpenBracketFunction", + Style.BreakAfterOpenBracketFunction); + IO.mapOptional("BreakAfterOpenBracketIf", Style.BreakAfterOpenBracketIf); + IO.mapOptional("BreakAfterOpenBracketLoop", + Style.BreakAfterOpenBracketLoop); + IO.mapOptional("BreakAfterOpenBracketSwitch", + Style.BreakAfterOpenBracketSwitch); IO.mapOptional("BreakAfterReturnType", Style.BreakAfterReturnType); IO.mapOptional("BreakArrays", Style.BreakArrays); IO.mapOptional("BreakBeforeBinaryOperators", Style.BreakBeforeBinaryOperators); + IO.mapOptional("BreakBeforeCloseBracketBracedList", + Style.BreakBeforeCloseBracketBracedList); + IO.mapOptional("BreakBeforeCloseBracketFunction", + Style.BreakBeforeCloseBracketFunction); + IO.mapOptional("BreakBeforeCloseBracketIf", + Style.BreakBeforeCloseBracketIf); + IO.mapOptional("BreakBeforeCloseBracketLoop", + Style.BreakBeforeCloseBracketLoop); + IO.mapOptional("BreakBeforeCloseBracketSwitch", + Style.BreakBeforeCloseBracketSwitch); IO.mapOptional("BreakBeforeConceptDeclarations", Style.BreakBeforeConceptDeclarations); IO.mapOptional("BreakBeforeBraces", Style.BreakBeforeBraces); @@ -1561,7 +1634,7 @@ static void expandPresetsSpacesInParens(FormatStyle &Expanded) { FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { FormatStyle LLVMStyle; LLVMStyle.AccessModifierOffset = -2; - LLVMStyle.AlignAfterOpenBracket = FormatStyle::BAS_Align; + LLVMStyle.AlignAfterOpenBracket = true; LLVMStyle.AlignArrayOfStructures = FormatStyle::AIAS_None; LLVMStyle.AlignConsecutiveAssignments = {}; LLVMStyle.AlignConsecutiveAssignments.PadOperators = true; @@ -1621,10 +1694,20 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.BreakAdjacentStringLiterals = true; LLVMStyle.BreakAfterAttributes = FormatStyle::ABS_Leave; LLVMStyle.BreakAfterJavaFieldAnnotations = false; + LLVMStyle.BreakAfterOpenBracketBracedList = false; + LLVMStyle.BreakAfterOpenBracketFunction = false; + LLVMStyle.BreakAfterOpenBracketIf = false; + LLVMStyle.BreakAfterOpenBracketLoop = false; + LLVMStyle.BreakAfterOpenBracketSwitch = false; LLVMStyle.BreakAfterReturnType = FormatStyle::RTBS_None; LLVMStyle.BreakArrays = true; LLVMStyle.BreakBeforeBinaryOperators = FormatStyle::BOS_None; LLVMStyle.BreakBeforeBraces = FormatStyle::BS_Attach; + LLVMStyle.BreakBeforeCloseBracketBracedList = false; + LLVMStyle.BreakBeforeCloseBracketFunction = false; + LLVMStyle.BreakBeforeCloseBracketIf = false; + LLVMStyle.BreakBeforeCloseBracketLoop = false; + LLVMStyle.BreakBeforeCloseBracketSwitch = false; LLVMStyle.BreakBeforeConceptDeclarations = FormatStyle::BBCDS_Always; LLVMStyle.BreakBeforeInlineASMColon = FormatStyle::BBIAS_OnlyMultiline; LLVMStyle.BreakBeforeTemplateCloser = false; @@ -1877,7 +1960,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) { GoogleStyle.PenaltyReturnTypeOnItsOwnLine = 200; if (Language == FormatStyle::LK_Java) { - GoogleStyle.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + GoogleStyle.AlignAfterOpenBracket = false; GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign; GoogleStyle.AlignTrailingComments = {}; GoogleStyle.AlignTrailingComments.Kind = FormatStyle::TCAS_Never; @@ -1889,7 +1972,9 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) { GoogleStyle.SpaceAfterCStyleCast = true; GoogleStyle.SpacesBeforeTrailingComments = 1; } else if (Language == FormatStyle::LK_JavaScript) { - GoogleStyle.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + GoogleStyle.BreakAfterOpenBracketBracedList = true; + GoogleStyle.BreakAfterOpenBracketFunction = true; + GoogleStyle.BreakAfterOpenBracketIf = true; GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign; GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty; // TODO: still under discussion whether to switch to SLS_All. @@ -2026,7 +2111,7 @@ FormatStyle getMozillaStyle() { FormatStyle getWebKitStyle() { FormatStyle Style = getLLVMStyle(); Style.AccessModifierOffset = -4; - Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + Style.AlignAfterOpenBracket = false; Style.AlignOperands = FormatStyle::OAS_DontAlign; Style.AlignTrailingComments = {}; Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Never; diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index d1c6264..28fdbcb 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -68,7 +68,7 @@ bool FormatToken::isBlockIndentedInitRBrace(const FormatStyle &Style) const { assert(MatchingParen); assert(MatchingParen->is(tok::l_brace)); if (Style.Cpp11BracedListStyle == FormatStyle::BLS_Block || - Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent) { + !Style.BreakBeforeCloseBracketBracedList) { return false; } const auto *LBrace = MatchingParen; @@ -198,7 +198,7 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) { return; // Column format doesn't really make sense if we don't align after brackets. - if (Style.AlignAfterOpenBracket == FormatStyle::BAS_DontAlign) + if (!Style.AlignAfterOpenBracket) return; FormatToken *ItemBegin = Token->Next; diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 6f3d24a..d833130 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -666,6 +666,12 @@ public: (endsSequence(tok::identifier, tok::kw_if) && AllowConstexprMacro); } + bool isLoop(const FormatStyle &Style) const { + return isOneOf(tok::kw_for, tok::kw_while) || + (Style.isJavaScript() && isNot(tok::l_paren) && Previous && + Previous->is(tok::kw_for)); + } + bool closesScopeAfterBlock() const { if (getBlockKind() == BK_Block) return true; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 021d8c6..8e227da 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4427,10 +4427,8 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, if (Left.is(tok::l_paren) && Style.PenaltyBreakOpenParenthesis != 0) return Style.PenaltyBreakOpenParenthesis; - if (Left.is(tok::l_paren) && InFunctionDecl && - Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign) { + if (Left.is(tok::l_paren) && InFunctionDecl && Style.AlignAfterOpenBracket) return 100; - } if (Left.is(tok::l_paren) && Left.Previous && (Left.Previous->isOneOf(tok::kw_for, tok::kw__Generic) || Left.Previous->isIf())) { @@ -4446,7 +4444,7 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, // If we aren't aligning after opening parens/braces we can always break // here unless the style does not want us to place all arguments on the // next line. - if (Style.AlignAfterOpenBracket == FormatStyle::BAS_DontAlign && + if (!Style.AlignAfterOpenBracket && (Left.ParameterCount <= 1 || Style.AllowAllArgumentsOnNextLine)) { return 0; } @@ -6226,24 +6224,31 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, (Right.isBlockIndentedInitRBrace(Style))); } - // We only break before r_paren if we're in a block indented context. + // We can break before r_paren if we're in a block indented context or + // a control statement with an explicit style option. if (Right.is(tok::r_paren)) { - if (Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent || - !Right.MatchingParen) { + if (!Right.MatchingParen) return false; - } auto Next = Right.Next; if (Next && Next->is(tok::r_paren)) Next = Next->Next; if (Next && Next->is(tok::l_paren)) return false; const FormatToken *Previous = Right.MatchingParen->Previous; - return !(Previous && (Previous->is(tok::kw_for) || Previous->isIf())); + if (!Previous) + return false; + if (Previous->isIf()) + return Style.BreakBeforeCloseBracketIf; + if (Previous->isLoop(Style)) + return Style.BreakBeforeCloseBracketLoop; + if (Previous->is(tok::kw_switch)) + return Style.BreakBeforeCloseBracketSwitch; + return Style.BreakBeforeCloseBracketFunction; } if (Left.isOneOf(tok::r_paren, TT_TrailingAnnotation) && Right.is(TT_TrailingAnnotation) && - Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) { + Style.BreakBeforeCloseBracketFunction) { return false; } diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index bd36eb4..be7c1d3 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4049,18 +4049,18 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, // -cl-std only applies for OpenCL language standards. // Override the -std option in this case. if (const Arg *A = Args.getLastArg(OPT_cl_std_EQ)) { - LangStandard::Kind OpenCLLangStd - = llvm::StringSwitch<LangStandard::Kind>(A->getValue()) - .Cases("cl", "CL", LangStandard::lang_opencl10) - .Cases("cl1.0", "CL1.0", LangStandard::lang_opencl10) - .Cases("cl1.1", "CL1.1", LangStandard::lang_opencl11) - .Cases("cl1.2", "CL1.2", LangStandard::lang_opencl12) - .Cases("cl2.0", "CL2.0", LangStandard::lang_opencl20) - .Cases("cl3.0", "CL3.0", LangStandard::lang_opencl30) - .Cases("clc++", "CLC++", LangStandard::lang_openclcpp10) - .Cases("clc++1.0", "CLC++1.0", LangStandard::lang_openclcpp10) - .Cases("clc++2021", "CLC++2021", LangStandard::lang_openclcpp2021) - .Default(LangStandard::lang_unspecified); + LangStandard::Kind OpenCLLangStd = + llvm::StringSwitch<LangStandard::Kind>(A->getValue()) + .Cases({"cl", "CL"}, LangStandard::lang_opencl10) + .Cases({"cl1.0", "CL1.0"}, LangStandard::lang_opencl10) + .Cases({"cl1.1", "CL1.1"}, LangStandard::lang_opencl11) + .Cases({"cl1.2", "CL1.2"}, LangStandard::lang_opencl12) + .Cases({"cl2.0", "CL2.0"}, LangStandard::lang_opencl20) + .Cases({"cl3.0", "CL3.0"}, LangStandard::lang_opencl30) + .Cases({"clc++", "CLC++"}, LangStandard::lang_openclcpp10) + .Cases({"clc++1.0", "CLC++1.0"}, LangStandard::lang_openclcpp10) + .Cases({"clc++2021", "CLC++2021"}, LangStandard::lang_openclcpp2021) + .Default(LangStandard::lang_unspecified); if (OpenCLLangStd == LangStandard::lang_unspecified) { Diags.Report(diag::err_drv_invalid_value) @@ -4600,7 +4600,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, // Validate that if fnative-half-type is given, that // the language standard is at least hlsl2018, and that // the target shader model is at least 6.2. - if (Args.getLastArg(OPT_fnative_half_type)) { + if (Args.getLastArg(OPT_fnative_half_type) || + Args.getLastArg(OPT_fnative_int16_type)) { const LangStandard &Std = LangStandard::getLangStandardForKind(Opts.LangStd); if (!(Opts.LangStd >= LangStandard::lang_hlsl2018 && @@ -4614,12 +4615,16 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Diags.Report(diag::err_drv_hlsl_bad_shader_unsupported) << VulkanEnv << T.getOSName() << T.str(); } - if (Args.getLastArg(OPT_fnative_half_type)) { + if (Args.getLastArg(OPT_fnative_half_type) || + Args.getLastArg(OPT_fnative_int16_type)) { + const char *Str = Args.getLastArg(OPT_fnative_half_type) + ? "-fnative-half-type" + : "-fnative-int16-type"; const LangStandard &Std = LangStandard::getLangStandardForKind(Opts.LangStd); if (!(Opts.LangStd >= LangStandard::lang_hlsl2018)) Diags.Report(diag::err_drv_hlsl_16bit_types_unsupported) - << "-fnative-half-type" << false << Std.getName(); + << Str << false << Std.getName(); } } else { llvm_unreachable("expected DXIL or SPIR-V target"); diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 47f1d5a..8602be1 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -399,7 +399,7 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, Builder.defineMacro("__HLSL_202y", Twine((unsigned)LangOptions::HLSLLangStd::HLSL_202y)); - if (LangOpts.NativeHalfType) + if (LangOpts.NativeHalfType && LangOpts.NativeInt16Type) Builder.defineMacro("__HLSL_ENABLE_16_BIT", "1"); // Shader target information diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 1858912..33fff76 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -162,18 +162,12 @@ set(x86_files adxintrin.h ammintrin.h amxavx512intrin.h - amxbf16transposeintrin.h amxcomplexintrin.h - amxcomplextransposeintrin.h amxfp16intrin.h - amxfp16transposeintrin.h amxfp8intrin.h amxintrin.h amxmovrsintrin.h - amxmovrstransposeintrin.h amxtf32intrin.h - amxtf32transposeintrin.h - amxtransposeintrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h deleted file mode 100644 index 86f09f2..0000000 --- a/clang/lib/Headers/amxbf16transposeintrin.h +++ /dev/null @@ -1,94 +0,0 @@ -/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead." -#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_BF16TRANSPOSEINTRIN_H -#define __AMX_BF16TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-bf16,amx-transpose"))) - -/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in -/// tiles \a a and \a b, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in \a dst, and store the -/// 32-bit result back to tile \a dst. -/// -/// \headerfile <immintrin.h> -/// -/// \code -/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b) -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO (a.colsb / 4) - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) * -/// FP32(b.row[k].bf16[2*n+0]) -/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) * -/// FP32(b.row[k].bf16[2*n+1]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTDPBF16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b)) - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS -_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2); -} - -/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in -/// tiles src0 and src1, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in "dst", and store the -/// 32-bit result back to tile "dst". -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __x86_64__ */ -#endif /* __AMX_BF16TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h deleted file mode 100644 index 11abaf9..0000000 --- a/clang/lib/Headers/amxcomplextransposeintrin.h +++ /dev/null @@ -1,303 +0,0 @@ -/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead." -#endif // __IMMINTRIN_H - -#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H -#define __AMX_COMPLEXTRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-complex,amx-transpose"))) - -/// Perform matrix multiplication of two tiles containing complex elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles \a a and \a b is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// Calculates the imaginary part of the result. For each possible combination -/// of (transposed column of \a a, column of \a b), it performs a set of -/// multiplication and accumulations on all corresponding complex numbers -/// (one from \a a and one from \a b). The imaginary part of the \a a element -/// is multiplied with the real part of the corresponding \a b element, and -/// the real part of the \a a element is multiplied with the imaginary part -/// of the corresponding \a b elements. The two accumulated results are -/// added, and then accumulated into the corresponding row and column of -/// \a dst. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b); -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO a.rows - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_tcmmimfp16ps(dst, a, b) \ - __builtin_ia32_ttcmmimfp16ps((dst), (a), (b)) - -/// Perform matrix multiplication of two tiles containing complex elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles \a a and \a b is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// Calculates the real part of the result. For each possible combination -/// of (rtransposed colum of \a a, column of \a b), it performs a set of -/// multiplication and accumulations on all corresponding complex numbers -/// (one from \a a and one from \a b). The real part of the \a a element is -/// multiplied with the real part of the corresponding \a b element, and the -/// negated imaginary part of the \a a element is multiplied with the -/// imaginary part of the corresponding \a b elements. The two accumulated -/// results are added, and then accumulated into the corresponding row and -/// column of \a dst. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b); -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO a.rows - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) -/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_tcmmrlfp16ps(dst, a, b) \ - __builtin_ia32_ttcmmrlfp16ps((dst), (a), (b)) - -/// Perform matrix conjugate transpose and multiplication of two tiles -/// containing complex elements and accumulate the results into a packed -/// single precision tile. Each dword element in input tiles \a a and \a b -/// is interpreted as a complex number with FP16 real part and FP16 imaginary -/// part. -/// Calculates the imaginary part of the result. For each possible combination -/// of (transposed column of \a a, column of \a b), it performs a set of -/// multiplication and accumulations on all corresponding complex numbers -/// (one from \a a and one from \a b). The negated imaginary part of the \a a -/// element is multiplied with the real part of the corresponding \a b -/// element, and the real part of the \a a element is multiplied with the -/// imaginary part of the corresponding \a b elements. The two accumulated -/// results are added, and then accumulated into the corresponding row and -/// column of \a dst. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b); -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO a.rows - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) -/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_conjtcmmimfp16ps(dst, a, b) \ - __builtin_ia32_tconjtcmmimfp16ps((dst), (a), (b)) - -/// Perform conjugate transpose of an FP16-pair of complex elements from \a a -/// and writes the result to \a dst. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// void _tile_conjtfp16(__tile dst, __tile a); -/// \endcode -/// -/// \code{.operation} -/// FOR i := 0 TO dst.rows - 1 -/// FOR j := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0] -/// tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1] -/// ENDFOR -/// write_row_and_zero(dst, i, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TCONJTFP16 instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The source tile. Max size is 1024 Bytes. -#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16((dst), (a)) - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal( - unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, - _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2); -} - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal( - unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, - _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2); -} - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal( - unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, - _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2); -} - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS -_tile_conjtfp16_internal(unsigned short m, unsigned short n, _tile1024i src) { - return __builtin_ia32_tconjtfp16_internal(m, n, src); -} - -/// Perform matrix multiplication of two tiles containing complex elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles src0 and src1 is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// This function calculates the imaginary part of the result. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -/// Perform matrix multiplication of two tiles containing complex elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles src0 and src1 is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// This function calculates the real part of the result. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -/// Perform matrix conjugate transpose and multiplication of two tiles -/// containing complex elements and accumulate the results into a packed -/// single precision tile. Each dword element in input tiles src0 and src1 -/// is interpreted as a complex number with FP16 real part and FP16 imaginary -/// part. -/// This function calculates the imaginary part of the result. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -/// Perform conjugate transpose of an FP16-pair of complex elements from src and -/// writes the result to dst. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src -/// The source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) { - dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile); -} - -#undef __DEFAULT_FN_ATTRS - -#endif // __x86_64__ -#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H diff --git a/clang/lib/Headers/amxfp16transposeintrin.h b/clang/lib/Headers/amxfp16transposeintrin.h deleted file mode 100644 index 191f8c6..0000000 --- a/clang/lib/Headers/amxfp16transposeintrin.h +++ /dev/null @@ -1,94 +0,0 @@ -/*===----- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE ------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use <amxfp16transposeintrin.h> directly; use <immintrin.h> instead." -#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_FP16TRANSPOSEINTRIN_H -#define __AMX_FP16TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-fp16,amx-transpose"))) - -/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in -/// tiles \a a and \a b, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in \a dst, and store the -/// 32-bit result back to tile \a dst. -/// -/// \headerfile <immintrin.h> -/// -/// \code -/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b) -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO (a.colsb / 4) - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * -/// FP32(b.row[k].fp16[2*n+0]) -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * -/// FP32(b.row[k].fp16[2*n+1]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTDPFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps((dst), (a), (b)) - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS -_tile_tdpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttdpfp16ps_internal(m, n, k, dst, src1, src2); -} - -/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in -/// tiles src0 and src1, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in "dst", and store the -/// 32-bit result back to tile "dst". -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTDPFP16PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static __inline__ void __tile_tdpfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tdpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __x86_64__ */ -#endif /* __AMX_FP16TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index a7da10d..208aa358 100644 --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -230,8 +230,6 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { /// bytes. Since there is no 2D type in llvm IR, we use vector type to /// represent 2D tile and the fixed size is maximum amx tile register size. typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); -typedef int _tile1024i_1024a - __attribute__((__vector_size__(1024), __aligned__(1024))); /// This is internal intrinsic. C/C++ user should avoid calling it directly. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h deleted file mode 100644 index 5f48cba..0000000 --- a/clang/lib/Headers/amxmovrstransposeintrin.h +++ /dev/null @@ -1,200 +0,0 @@ -/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * ===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead." -#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H -#define __AMX_MOVRS_TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-transpose,amx-movrs"))) - -#define _tile_2rpntlvwz0rs(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride) -#define _tile_2rpntlvwz0rst1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride) -#define _tile_2rpntlvwz1rs(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride) -#define _tile_2rpntlvwz1rst1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride) - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - // Use __tile1024i_1024a* to escape the alignment check in - // clang/test/Headers/x86-intrinsics-headers-clean.cpp - __builtin_ia32_t2rpntlvwz0rs_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz0rst1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1rs_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1rst1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -#undef __DEFAULT_FN_ATTRS -#endif /* __x86_64__ */ -#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/amxtf32transposeintrin.h b/clang/lib/Headers/amxtf32transposeintrin.h deleted file mode 100644 index e1b90c1..0000000 --- a/clang/lib/Headers/amxtf32transposeintrin.h +++ /dev/null @@ -1,105 +0,0 @@ -/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error \ - "Never use <amxtf32transposeintrin.h> directly; include <immintrin.h> instead." -#endif // __IMMINTRIN_H - -#ifndef __AMX_TF32TRANSPOSEINTRIN_H -#define __AMX_TF32TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-tf32,amx-transpose"))) - -/// \code -/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \ -/// constexpr int b); -/// \endcode -/// -/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction. -/// -/// \param srcdst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -/// -/// \code{.operation} -/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) { -/// dword[12:0] := 0 -/// dword[31:13] := x[31:13] -/// return dword -/// } -/// -/// DEFINE silence_snan_fp32(x[31:0]) { -/// IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0) -/// x.fraction[22] := 1 -/// return x -/// } -/// -/// elements_dest:= srcdst.colsb/4 -/// -/// FOR m := 0 TO (srcdst.rows-1) -/// tmp[511:0] := 0 -/// FOR k := 0 TO (a.rows-1) -/// FOR n := 0 TO (elements_dest-1) -/// a1e := silence_snan_fp32(a.row[k].fp32[m]) -/// a2e := silence_snan_fp32(b.row[k].fp32[n]) -/// s1e := zero_lower_mantissa_bits_fp32(a1e) -/// s2e := zero_lower_mantissa_bits_fp32(a2e) -/// tmp.fp32[n] += s1e * s2e -/// ENDFOR -/// ENDFOR -/// -/// FOR n := 0 TO (elements_dest-1) -/// tmp.fp32[n] += srcdst.row[m].fp32[n] -/// ENDFOR -/// write_row_and_zero(srcdst, m, tmp, srcdst.colsb) -/// -/// ENDFOR -/// -/// zero_upper_rows(srcdst, srcdst.rows) -/// zero_tileconfig_start() -/// \endcode -#define _tile_tmmultf32ps(srcdst, a, b) \ - __builtin_ia32_ttmmultf32ps((srcdst), (a), (b)) - -// dst = m x n (srcdest), src1 = k x m, src2 = k x n -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE -_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2); -} - -/// Compute transpose and do Matrix Multiplication of src0 and src1, and then do -/// Matrix Plus with dst. All the calculation is base on float32 but with the -/// lower 13-bit set to 0. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_TF32_TRANSPOSE -static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -#endif // __x86_64__ -#endif // __AMX_TF32TRANSPOSEINTRIN_H diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h deleted file mode 100644 index b3fa37d..0000000 --- a/clang/lib/Headers/amxtransposeintrin.h +++ /dev/null @@ -1,248 +0,0 @@ -/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * ===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead." -#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_TRANSPOSEINTRIN_H -#define __AMX_TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS_TRANSPOSE \ - __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) - -#define _tile_2rpntlvwz0(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0(tdst, base, stride) -#define _tile_2rpntlvwz0t1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride) -#define _tile_2rpntlvwz1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1(tdst, base, stride) -#define _tile_2rpntlvwz1t1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) - -/// Transpose 32-bit elements from \a src and write the result to \a dst. -/// -/// \headerfile <immintrin.h> -/// -/// \code -/// void _tile_transposed(__tile dst, __tile src); -/// \endcode -/// -/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src -/// The source tile. Max size is 1024 Bytes. -/// -/// \code{.operation} -/// -/// FOR i := 0 TO (dst.rows-1) -/// tmp[511:0] := 0 -/// FOR j := 0 TO (dst.colsb/4-1) -/// tmp.dword[j] := src.row[j].dword[i] -/// ENDFOR -/// dst.row[i] := tmp -/// ENDFOR -/// -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src) - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - // Use __tile1024i_1024a* to escape the alignment check in - // clang/test/Headers/x86-intrinsics-headers-clean.cpp - __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0, - (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz0t1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0, - (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1t1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE -_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { - return __builtin_ia32_ttransposed_internal(m, n, src); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// Provides a hint to the implementation that the data will likely not be -/// reused in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely not be -/// reused in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely not be -/// reused in the near future and the data caching can be optimized. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Transpose 32-bit elements from src and write the result to dst. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src -/// The source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_transposed(__tile1024i *dst, __tile1024i src) { - dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile); -} - -#endif /* __x86_64__ */ -#endif /* __AMX_TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h index fe4277e..ee243ab 100644 --- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h +++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #ifndef _HLSL_COMPAT_OVERLOADS_H_ -#define _HLSl_COMPAT_OVERLOADS_H_ +#define _HLSL_COMPAT_OVERLOADS_H_ namespace hlsl { diff --git a/clang/lib/Headers/hvx_hexagon_protos.h b/clang/lib/Headers/hvx_hexagon_protos.h index fd120a5..19309a4 100644 --- a/clang/lib/Headers/hvx_hexagon_protos.h +++ b/clang/lib/Headers/hvx_hexagon_protos.h @@ -5605,6 +5605,399 @@ __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_f8)(Vu, Vv) #endif /* __HEXAGON_ARCH___ >= 79 */ +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vabs(Vu32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vabs_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vabs(Vu32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vabs_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vabs(Vu32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vqf32(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vabs_Vqf32(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vabs(Vu32.sf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vabs_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32=valign4(Vu32,Vv32,Rt8) + C Intrinsic Prototype: HVX_Vector Q6_V_valign4_VVR(HVX_Vector Vu, HVX_Vector + Vv, Word32 Rt) Instruction Type: CVI_VA Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_V_valign4_VVR(Vu, Vv, Rt) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_valign4)(Vu, Vv, Rt) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.bf=Vuu32.qf32 + C Intrinsic Prototype: HVX_Vector Q6_Vbf_equals_Wqf32(HVX_VectorPair Vuu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vbf_equals_Wqf32(Vuu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_bf_qf32)(Vuu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.f8=Vu32.qf16 + C Intrinsic Prototype: HVX_Vector Q6_V_equals_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_V_equals_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_f8_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.h=Vu32.hf:rnd + C Intrinsic Prototype: HVX_Vector Q6_Vh_equals_Vhf_rnd(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vh_equals_Vhf_rnd(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_h_hf_rnd)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vdd32.qf16=Vu32.f8 + C Intrinsic Prototype: HVX_VectorPair Q6_Wqf16_equals_V(HVX_Vector Vu) + Instruction Type: CVI_VP_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Wqf16_equals_V(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_f8)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=Vu32.hf + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_equals_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=Vu32.qf16 + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_equals_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=Vu32.qf32 + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vqf32(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_equals_Vqf32(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=Vu32.sf + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_equals_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qd4=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VhfVhf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VA Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eq_VhfVhf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf)(Vu, Vv)), -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4&=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVhfVhf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqand_QVhfVhf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_and)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4|=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVhfVhf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqor_QVhfVhf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_or)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4^=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVhfVhf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqxacc_QVhfVhf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_xor)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qd4=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VsfVsf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VA Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eq_VsfVsf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf)(Vu, Vv)), -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4&=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVsfVsf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqand_QVsfVsf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_and)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4|=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVsfVsf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqor_QVsfVsf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_or)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4^=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVsfVsf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqxacc_QVsfVsf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_xor)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf32(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vqf32(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.sf) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vneg(Vu32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vneg_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vneg(Vu32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vneg_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vneg(Vu32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vqf32(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vneg_Vqf32(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vneg(Vu32.sf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vneg_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vsub(Vu32.hf,Vv32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vsub_VhfVqf16(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VS Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vsub_VhfVqf16(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_mix)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vsub(Vu32.sf,Vv32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vsub_VsfVqf32(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VS Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vsub_VsfVqf32(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_sf_mix)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 81 */ + #endif /* __HVX__ */ #endif diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 35f012c..19064a4 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -475,24 +475,12 @@ _storebe_i64(void * __P, long long __D) { #include <amxfp8intrin.h> -#include <amxtransposeintrin.h> - #include <amxmovrsintrin.h> -#include <amxmovrstransposeintrin.h> - #include <amxavx512intrin.h> #include <amxtf32intrin.h> -#include <amxtf32transposeintrin.h> - -#include <amxbf16transposeintrin.h> - -#include <amxfp16transposeintrin.h> - -#include <amxcomplextransposeintrin.h> - #include <avx512vp2intersectintrin.h> #include <avx512vlvp2intersectintrin.h> diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index e4b158e..7e4a164 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -4248,6 +4248,13 @@ void Parser::ParseDeclarationSpecifiers( // type-specifier case tok::kw_short: + if (!getLangOpts().NativeInt16Type) { + Diag(Tok, diag::err_unknown_typename) << Tok.getName(); + DS.SetTypeSpecError(); + DS.SetRangeEnd(Tok.getLocation()); + ConsumeToken(); + goto DoneWithDeclSpec; + } isInvalid = DS.SetTypeSpecWidth(TypeSpecifierWidth::Short, Loc, PrevSpec, DiagID, Policy); break; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index f451787..ad2c2e4 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3542,9 +3542,7 @@ bool Sema::ValueIsRunOfOnes(CallExpr *TheCall, unsigned ArgNum) { bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx, unsigned FirstArg, FormatStringInfo *FSI) { - bool IsCXXMember = false; - if (const auto *MD = dyn_cast<CXXMethodDecl>(D)) - IsCXXMember = MD->isInstance(); + bool HasImplicitThisParam = hasImplicitObjectParameter(D); bool IsVariadic = false; if (const FunctionType *FnTy = D->getFunctionType()) IsVariadic = cast<FunctionProtoType>(FnTy)->isVariadic(); @@ -3553,11 +3551,12 @@ bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx, else if (const auto *OMD = dyn_cast<ObjCMethodDecl>(D)) IsVariadic = OMD->isVariadic(); - return getFormatStringInfo(FormatIdx, FirstArg, IsCXXMember, IsVariadic, FSI); + return getFormatStringInfo(FormatIdx, FirstArg, HasImplicitThisParam, + IsVariadic, FSI); } bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg, - bool IsCXXMember, bool IsVariadic, + bool HasImplicitThisParam, bool IsVariadic, FormatStringInfo *FSI) { if (FirstArg == 0) FSI->ArgPassingKind = FAPK_VAList; @@ -3571,7 +3570,7 @@ bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg, // The way the format attribute works in GCC, the implicit this argument // of member functions is counted. However, it doesn't appear in our own // lists, so decrement format_idx in that case. - if (IsCXXMember) { + if (HasImplicitThisParam) { if(FSI->FormatIdx == 0) return false; --FSI->FormatIdx; diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 0514d10..aa93507 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -10208,6 +10208,24 @@ void SemaCodeCompletion::CodeCompletePreprocessorDirective(bool InConditional) { Builder.AddPlaceholderChunk("message"); Results.AddResult(Builder.TakeString()); + if (getLangOpts().C23) { + // #embed "file" + Builder.AddTypedTextChunk("embed"); + Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace); + Builder.AddTextChunk("\""); + Builder.AddPlaceholderChunk("file"); + Builder.AddTextChunk("\""); + Results.AddResult(Builder.TakeString()); + + // #embed <file> + Builder.AddTypedTextChunk("embed"); + Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace); + Builder.AddTextChunk("<"); + Builder.AddPlaceholderChunk("file"); + Builder.AddTextChunk(">"); + Results.AddResult(Builder.TakeString()); + } + // Note: #ident and #sccs are such crazy anachronisms that we don't provide // completions for them. And __include_macros is a Clang-internal extension // that we don't want to encourage anyone to use. diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 964a2a7..a9e7b44 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3785,7 +3785,7 @@ static bool handleFormatAttrCommon(Sema &S, Decl *D, const ParsedAttr &AL, // In C++ the implicit 'this' function parameter also counts, and they are // counted from one. - bool HasImplicitThisParam = isInstanceMethod(D); + bool HasImplicitThisParam = hasImplicitObjectParameter(D); Info->NumArgs = getFunctionOrMethodNumParams(D) + HasImplicitThisParam; Info->Identifier = AL.getArgAsIdent(0)->getIdentifierInfo(); @@ -3926,7 +3926,7 @@ static void handleCallbackAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - bool HasImplicitThisParam = isInstanceMethod(D); + bool HasImplicitThisParam = hasImplicitObjectParameter(D); int32_t NumArgs = getFunctionOrMethodNumParams(D); FunctionDecl *FD = D->getAsFunction(); @@ -4110,7 +4110,7 @@ static void handleLifetimeCaptureByAttr(Sema &S, Decl *D, } void Sema::LazyProcessLifetimeCaptureByParams(FunctionDecl *FD) { - bool HasImplicitThisParam = isInstanceMethod(FD); + bool HasImplicitThisParam = hasImplicitObjectParameter(FD); SmallVector<LifetimeCaptureByAttr *, 1> Attrs; for (ParmVarDecl *PVD : FD->parameters()) if (auto *A = PVD->getAttr<LifetimeCaptureByAttr>()) diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 850bcb1..2f61bdd 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -489,14 +489,6 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tileloaddrst164: case X86::BI__builtin_ia32_tilestored64: case X86::BI__builtin_ia32_tilezero: - case X86::BI__builtin_ia32_t2rpntlvwz0: - case X86::BI__builtin_ia32_t2rpntlvwz0t1: - case X86::BI__builtin_ia32_t2rpntlvwz1: - case X86::BI__builtin_ia32_t2rpntlvwz1t1: - case X86::BI__builtin_ia32_t2rpntlvwz0rst1: - case X86::BI__builtin_ia32_t2rpntlvwz1rs: - case X86::BI__builtin_ia32_t2rpntlvwz1rst1: - case X86::BI__builtin_ia32_t2rpntlvwz0rs: case X86::BI__builtin_ia32_tcvtrowps2bf16h: case X86::BI__builtin_ia32_tcvtrowps2bf16l: case X86::BI__builtin_ia32_tcvtrowps2phh: @@ -516,17 +508,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tdpbhf8ps: case X86::BI__builtin_ia32_tdphbf8ps: case X86::BI__builtin_ia32_tdphf8ps: - case X86::BI__builtin_ia32_ttdpbf16ps: - case X86::BI__builtin_ia32_ttdpfp16ps: - case X86::BI__builtin_ia32_ttcmmimfp16ps: - case X86::BI__builtin_ia32_ttcmmrlfp16ps: - case X86::BI__builtin_ia32_tconjtcmmimfp16ps: case X86::BI__builtin_ia32_tmmultf32ps: - case X86::BI__builtin_ia32_ttmmultf32ps: return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2}); - case X86::BI__builtin_ia32_ttransposed: - case X86::BI__builtin_ia32_tconjtfp16: - return CheckBuiltinTileArgumentsRange(TheCall, {0, 1}); } } static bool isX86_32Builtin(unsigned BuiltinID) { diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp index 42f52d0..eebecdb 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp @@ -350,7 +350,7 @@ void sanitizeDiagOpts(DiagnosticOptions &DiagOpts) { // See `test/ClangScanDeps/diagnostic-pragmas.c` for an example. llvm::erase_if(DiagOpts.Warnings, [](StringRef Warning) { return llvm::StringSwitch<bool>(Warning) - .Cases("pch-vfs-diff", "error=pch-vfs-diff", false) + .Cases({"pch-vfs-diff", "error=pch-vfs-diff"}, false) .StartsWith("no-error=", false) .Default(true); }); diff --git a/clang/lib/Tooling/Transformer/RangeSelector.cpp b/clang/lib/Tooling/Transformer/RangeSelector.cpp index 171c786..b4bdec1 100644 --- a/clang/lib/Tooling/Transformer/RangeSelector.cpp +++ b/clang/lib/Tooling/Transformer/RangeSelector.cpp @@ -205,8 +205,12 @@ RangeSelector transformer::name(std::string ID) { // `foo<int>` for which this range will be too short. Doing so will // require subcasing `NamedDecl`, because it doesn't provide virtual // access to the \c DeclarationNameInfo. - if (tooling::getText(R, *Result.Context) != D->getName()) - return CharSourceRange(); + StringRef Text = tooling::getText(R, *Result.Context); + if (Text != D->getName()) + return llvm::make_error<StringError>( + llvm::errc::not_supported, + "range selected by name(node id=" + ID + "): '" + Text + + "' is different from decl name '" + D->getName() + "'"); return R; } if (const auto *E = Node.get<DeclRefExpr>()) { |
