Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/IR2Vec.cpp | 20
-rw-r--r--  llvm/lib/CodeGen/InterleavedAccessPass.cpp | 31
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 12
-rw-r--r--  llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp | 10
-rw-r--r--  llvm/lib/IR/RuntimeLibcalls.cpp | 45
-rw-r--r--  llvm/lib/Object/IRSymtab.cpp | 45
-rw-r--r--  llvm/lib/Support/BLAKE3/blake3_dispatch.c | 2
-rw-r--r--  llvm/lib/Support/BLAKE3/blake3_impl.h | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 7
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 41
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP1Instructions.td | 6
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 10
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/DirectX/DXILFlattenArrays.cpp | 10
-rw-r--r--  llvm/lib/Target/DirectX/DXILLegalizePass.cpp | 36
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.h | 5
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp | 181
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 27
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/X86/X86InterleavedAccess.cpp | 7
-rw-r--r--  llvm/lib/TargetParser/TargetParser.cpp | 1
-rw-r--r--  llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp | 19
27 files changed, 285 insertions(+), 254 deletions(-)
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 898bf5b..95f30fd 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -215,7 +215,7 @@ Vocabulary::Vocabulary(VocabVector &&Vocab)
: Vocab(std::move(Vocab)), Valid(true) {}
bool Vocabulary::isValid() const {
- return Vocab.size() == (MaxOpcodes + MaxTypeIDs + MaxOperandKinds) && Valid;
+ return Vocab.size() == Vocabulary::expectedSize() && Valid;
}
size_t Vocabulary::size() const {
@@ -324,8 +324,24 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) {
return OperandKind::VariableID;
}
+unsigned Vocabulary::getNumericID(unsigned Opcode) {
+ assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
+ return Opcode - 1; // Convert to zero-based index
+}
+
+unsigned Vocabulary::getNumericID(Type::TypeID TypeID) {
+ assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID");
+ return MaxOpcodes + static_cast<unsigned>(TypeID);
+}
+
+unsigned Vocabulary::getNumericID(const Value *Op) {
+ unsigned Index = static_cast<unsigned>(getOperandKind(Op));
+ assert(Index < MaxOperandKinds && "Invalid OperandKind");
+ return MaxOpcodes + MaxTypeIDs + Index;
+}
+
StringRef Vocabulary::getStringKey(unsigned Pos) {
- assert(Pos < MaxOpcodes + MaxTypeIDs + MaxOperandKinds &&
+ assert(Pos < Vocabulary::expectedSize() &&
"Position out of bounds in vocabulary");
// Opcode
if (Pos < MaxOpcodes)
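For reference, a minimal standalone sketch of the flattened index space the new getNumericID overloads assume (the constants here are illustrative, not LLVM's actual values): opcodes occupy [0, MaxOpcodes), type IDs follow, then operand kinds, which is why Vocabulary::expectedSize() can replace the hand-written sum.

    #include <cassert>

    // Illustrative constants only; not the real Vocabulary values.
    constexpr unsigned MaxOpcodes = 68, MaxTypeIDs = 21, MaxOperandKinds = 4;

    constexpr unsigned opcodeSlot(unsigned Opcode) { return Opcode - 1; }
    constexpr unsigned typeSlot(unsigned TypeID) { return MaxOpcodes + TypeID; }
    constexpr unsigned operandSlot(unsigned Kind) {
      return MaxOpcodes + MaxTypeIDs + Kind;
    }

    int main() {
      static_assert(opcodeSlot(1) == 0, "opcodes are 1-based, slots 0-based");
      static_assert(typeSlot(0) == MaxOpcodes, "type IDs follow opcodes");
      static_assert(operandSlot(0) == MaxOpcodes + MaxTypeIDs,
                    "operand kinds come last");
      // The last valid slot is expectedSize() - 1.
      assert(operandSlot(MaxOperandKinds - 1) ==
             MaxOpcodes + MaxTypeIDs + MaxOperandKinds - 1);
    }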
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index d43cd46..d2b2edf 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -367,34 +367,23 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
bool BinOpShuffleChanged =
replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
+ Value *Mask = nullptr;
if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
- Value *LaneMask =
- getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
- if (!LaneMask)
+ Mask = getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
+ if (!Mask)
return false;
-
LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n");
-
- // Sometimes the number of Shuffles might be less than Factor, we have to
- // fill the gaps with null. Also, lowerInterleavedVPLoad
- // expects them to be sorted.
- SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
- for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
- ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
- if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
- // If Extracts is not empty, tryReplaceExtracts made changes earlier.
- return !Extracts.empty() || BinOpShuffleChanged;
} else {
LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
-
- // Try to create target specific intrinsics to replace the load and
- // shuffles.
- if (!TLI->lowerInterleavedLoad(cast<LoadInst>(Load), Shuffles, Indices,
- Factor))
- // If Extracts is not empty, tryReplaceExtracts made changes earlier.
- return !Extracts.empty() || BinOpShuffleChanged;
}
+ // Try to create target specific intrinsics to replace the load and
+ // shuffles.
+ if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
+ Indices, Factor))
+ // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+ return !Extracts.empty() || BinOpShuffleChanged;
+
DeadInsts.insert_range(Shuffles);
DeadInsts.insert(Load);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 74c14ed..01e5312 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -845,16 +846,13 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
static void failForInvalidBundles(const CallBase &I, StringRef Name,
ArrayRef<uint32_t> AllowedBundles) {
if (I.hasOperandBundlesOtherThan(AllowedBundles)) {
+ ListSeparator LS;
std::string Error;
+ raw_string_ostream OS(Error);
for (unsigned i = 0, e = I.getNumOperandBundles(); i != e; ++i) {
OperandBundleUse U = I.getOperandBundleAt(i);
- bool First = true;
- if (is_contained(AllowedBundles, U.getTagID()))
- continue;
- if (!First)
- Error += ", ";
- First = false;
- Error += U.getTagName();
+ if (!is_contained(AllowedBundles, U.getTagID()))
+ OS << LS << U.getTagName();
}
reportFatalUsageError(
Twine("cannot lower ", Name)
diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
index 222dc88..559d808 100644
--- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
@@ -43,6 +43,12 @@ namespace llvm {
using namespace dwarf_linker;
using namespace dwarf_linker::classic;
+enum InvalidStmtSeqOffset {
+ MaxStmtSeqOffset = UINT64_MAX,
+ OrigOffsetMissing = MaxStmtSeqOffset - 1,
+ NewOffsetMissing = MaxStmtSeqOffset - 2,
+};
+
/// Hold the input and output of the debug info size in bytes.
struct DebugInfoSize {
uint64_t Input;
@@ -2315,7 +2321,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) {
// Some sequences are discarded by the DWARFLinker if they are invalid
// (empty).
if (OrigRowIter == SeqOffToOrigRow.end()) {
- StmtSeq.set(UINT64_MAX);
+ StmtSeq.set(OrigOffsetMissing);
continue;
}
size_t OrigRowIndex = OrigRowIter->second;
@@ -2325,7 +2331,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) {
if (NewRowIter == OrigRowToNewRow.end()) {
// If the original row index is not found in the map, update the
// stmt_sequence attribute to the 'invalid offset' magic value.
- StmtSeq.set(UINT64_MAX);
+ StmtSeq.set(NewOffsetMissing);
continue;
}
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index b1864897..5936ac7 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -135,6 +135,51 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
}
}
+RTLIB::LibcallImpl
+RuntimeLibcallsInfo::getSupportedLibcallImpl(StringRef FuncName) const {
+ const ArrayRef<uint16_t> RuntimeLibcallNameOffsets(
+ RuntimeLibcallNameOffsetTable);
+
+ iterator_range<ArrayRef<uint16_t>::const_iterator> Range =
+ getRecognizedLibcallImpls(FuncName);
+
+ for (auto I = Range.begin(); I != Range.end(); ++I) {
+ RTLIB::LibcallImpl Impl =
+ static_cast<RTLIB::LibcallImpl>(I - RuntimeLibcallNameOffsets.begin());
+
+ // FIXME: This should not depend on looking up ImplToLibcall, only the list
+ // of libcalls for the module.
+ RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]];
+ if (Recognized != RTLIB::Unsupported)
+ return Recognized;
+ }
+
+ return RTLIB::Unsupported;
+}
+
+iterator_range<ArrayRef<uint16_t>::const_iterator>
+RuntimeLibcallsInfo::getRecognizedLibcallImpls(StringRef FuncName) {
+ StringTable::Iterator It = lower_bound(RuntimeLibcallImplNameTable, FuncName);
+ if (It == RuntimeLibcallImplNameTable.end() || *It != FuncName)
+ return iterator_range(ArrayRef<uint16_t>());
+
+ uint16_t IndexVal = It.offset().value();
+ const ArrayRef<uint16_t> TableRef(RuntimeLibcallNameOffsetTable);
+
+ ArrayRef<uint16_t>::const_iterator E = TableRef.end();
+ ArrayRef<uint16_t>::const_iterator EntriesBegin =
+ std::lower_bound(TableRef.begin(), E, IndexVal);
+ ArrayRef<uint16_t>::const_iterator EntriesEnd = EntriesBegin;
+
+ while (EntriesEnd != E && *EntriesEnd == IndexVal)
+ ++EntriesEnd;
+
+ assert(EntriesBegin != E &&
+ "libcall found in name table but not offset table");
+
+ return make_range(EntriesBegin, EntriesEnd);
+}
+
bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) {
switch (TT.getOS()) {
case Triple::MacOSX:
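The offset-table walk in getRecognizedLibcallImpls amounts to taking the equal range of the name's offset in a table sorted by offset, where several LibcallImpls may share one symbol name. A standalone sketch with hypothetical data:

    #include <algorithm>
    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      // Sorted offsets into a name table; 42 appears twice because two
      // libcall implementations share the same name.
      constexpr std::array<uint16_t, 5> Offsets = {7, 19, 42, 42, 60};
      uint16_t IndexVal = 42; // offset of the name found by lower_bound
      auto [Begin, End] =
          std::equal_range(Offsets.begin(), Offsets.end(), IndexVal);
      assert(End - Begin == 2); // both candidate impls are returned
    }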
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index 2579fa3..79eeb08 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -54,6 +54,11 @@ static const char *PreservedSymbols[] = {
"__stack_chk_guard",
};
+static bool isPreservedGlobalVarName(StringRef Name) {
+ return StringRef(PreservedSymbols[0]) == Name ||
+ StringRef(PreservedSymbols[1]) == Name;
+}
+
namespace {
const char *getExpectedProducerName() {
@@ -81,12 +86,16 @@ struct Builder {
// The StringTableBuilder does not create a copy of any strings added to it,
// so this provides somewhere to store any strings that we create.
Builder(SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder,
- BumpPtrAllocator &Alloc)
- : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc) {}
+ BumpPtrAllocator &Alloc, const Triple &TT)
+ : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc), TT(TT),
+ Libcalls(TT) {}
DenseMap<const Comdat *, int> ComdatMap;
Mangler Mang;
- Triple TT;
+ const Triple &TT;
+
+ // FIXME: This shouldn't be here.
+ RTLIB::RuntimeLibcallsInfo Libcalls;
std::vector<storage::Comdat> Comdats;
std::vector<storage::Module> Mods;
@@ -98,6 +107,10 @@ struct Builder {
std::vector<storage::Str> DependentLibraries;
+ bool isPreservedLibFuncName(StringRef Name) {
+ return Libcalls.getSupportedLibcallImpl(Name) != RTLIB::Unsupported;
+ }
+
void setStr(storage::Str &S, StringRef Value) {
S.Offset = StrtabBuilder.add(Value);
S.Size = Value.size();
@@ -213,18 +226,6 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) {
return P.first->second;
}
-static DenseSet<StringRef> buildPreservedSymbolsSet(const Triple &TT) {
- DenseSet<StringRef> PreservedSymbolSet(std::begin(PreservedSymbols),
- std::end(PreservedSymbols));
- // FIXME: Do we need to pass in ABI fields from TargetOptions?
- RTLIB::RuntimeLibcallsInfo Libcalls(TT);
- for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) {
- if (Impl != RTLIB::Unsupported)
- PreservedSymbolSet.insert(Libcalls.getLibcallImplName(Impl));
- }
- return PreservedSymbolSet;
-}
-
Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
const SmallPtrSet<GlobalValue *, 4> &Used,
ModuleSymbolTable::Symbol Msym) {
@@ -278,13 +279,11 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
return Error::success();
}
- setStr(Sym.IRName, GV->getName());
-
- static const DenseSet<StringRef> PreservedSymbolsSet =
- buildPreservedSymbolsSet(GV->getParent()->getTargetTriple());
- bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName());
+ StringRef GVName = GV->getName();
+ setStr(Sym.IRName, GVName);
- if (Used.count(GV) || IsPreservedSymbol)
+ if (Used.count(GV) || isPreservedLibFuncName(GVName) ||
+ isPreservedGlobalVarName(GVName))
Sym.Flags |= 1 << storage::Symbol::FB_used;
if (GV->isThreadLocal())
Sym.Flags |= 1 << storage::Symbol::FB_tls;
@@ -351,7 +350,6 @@ Error Builder::build(ArrayRef<Module *> IRMods) {
setStr(Hdr.Producer, kExpectedProducerName);
setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple().str());
setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName());
- TT = IRMods[0]->getTargetTriple();
for (auto *M : IRMods)
if (Error Err = addModule(M))
@@ -377,7 +375,8 @@ Error Builder::build(ArrayRef<Module *> IRMods) {
Error irsymtab::build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab,
StringTableBuilder &StrtabBuilder,
BumpPtrAllocator &Alloc) {
- return Builder(Symtab, StrtabBuilder, Alloc).build(Mods);
+ const Triple &TT = Mods[0]->getTargetTriple();
+ return Builder(Symtab, StrtabBuilder, Alloc, TT).build(Mods);
}
// Upgrade a vector of bitcode modules created by an old version of LLVM by
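The removed buildPreservedSymbolsSet result was cached in a function-local static, so it was built from the first module's target triple and silently reused for every later module; threading the triple through the Builder avoids that. A minimal sketch of the hazard, with hypothetical names:

    #include <cassert>
    #include <set>
    #include <string>

    std::set<std::string> buildSet(const std::string &Triple) {
      return {Triple + "-libcall"};
    }

    bool isPreserved(const std::string &Triple, const std::string &Name) {
      // Initialized exactly once, from whatever Triple the first caller passes.
      static const std::set<std::string> Cached = buildSet(Triple);
      return Cached.count(Name) != 0;
    }

    int main() {
      assert(isPreserved("x86_64", "x86_64-libcall"));    // builds the set
      assert(!isPreserved("aarch64", "aarch64-libcall")); // stale x86_64 set
    }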
diff --git a/llvm/lib/Support/BLAKE3/blake3_dispatch.c b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
index d00580f..19918aa 100644
--- a/llvm/lib/Support/BLAKE3/blake3_dispatch.c
+++ b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
@@ -236,7 +236,7 @@ void blake3_xof_many(const uint32_t cv[8],
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
-#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
+#if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
return;
diff --git a/llvm/lib/Support/BLAKE3/blake3_impl.h b/llvm/lib/Support/BLAKE3/blake3_impl.h
index deed079..dd71e72 100644
--- a/llvm/lib/Support/BLAKE3/blake3_impl.h
+++ b/llvm/lib/Support/BLAKE3/blake3_impl.h
@@ -324,7 +324,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__CYGWIN__)
LLVM_LIBRARY_VISIBILITY
void blake3_xof_many_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ff23f76..d04e6c4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17155,7 +17155,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -17163,6 +17163,11 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load");
+
const DataLayout &DL = LI->getDataLayout();
VectorType *VTy = Shuffles[0]->getType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 7b1de3d..713793e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -211,7 +211,7 @@ public:
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index faf59c1..0e0e83b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1118,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts",
"Has v_bitop3_b32/v_bitop3_b16 instructions"
>;
+def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
+ "HasTanhInsts",
+ "true",
+ "Has v_tanh_f32/f16 instructions"
+>;
+
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
"HasTransposeLoadF4F6Insts",
"true",
@@ -1979,6 +1985,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
+ FeatureTanhInsts,
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
@@ -2703,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
AssemblerPredicate<(all_of FeatureBitOp3Insts)>;
+def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
+ AssemblerPredicate<(all_of FeatureTanhInsts)>;
+
def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 74632c7..c8a4e22 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -200,6 +200,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{95-72} = !if(ps.has_offset, offset, ?);
}
+// TODO: Rename to FlatSaddrTable; it now handles both global and flat GVS addressing modes.
class GlobalSaddrTable <bit is_saddr, string Name = ""> {
bit IsSaddr = is_saddr;
string SaddrOp = Name;
@@ -1723,6 +1724,7 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>;
defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
@@ -1734,7 +1736,7 @@ defm : FlatStorePats <FLAT_STORE_DWORD, store_flat, vt>;
foreach vt = VReg_64.RegTypes in {
defm : FlatStorePats <FLAT_STORE_DWORDX2, store_flat, vt>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, load_flat, vt>;
}
defm : FlatStorePats <FLAT_STORE_DWORDX3, store_flat, v3i32>;
@@ -1746,6 +1748,7 @@ defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>;
defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
+defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>;
defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
@@ -1791,6 +1794,9 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
} // end foreach as
+defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
+
let SubtargetPredicate = isGFX12Plus in {
defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >;
@@ -1805,19 +1811,19 @@ defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
let OtherPredicates = [D16PreservesUnusedBits] in {
// TODO: Handle atomic loads
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
} // End OtherPredicates = [HasFlatAddressSpace]
@@ -1889,6 +1895,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>;
// appropriate waits.
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
@@ -1928,6 +1935,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
@@ -2946,13 +2954,6 @@ multiclass VFLAT_Real_AllAddr_gfx12<bits<8> op,
defm _SADDR : VFLAT_Real_gfx12<op, name>;
}
-multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op,
- string name = get_FLAT_ps<NAME>.Mnemonic,
- string alias = name> :
- VFLAT_Real_Base_gfx12<op, name, alias> {
- defm _SADDR : VFLAT_Real_gfx12<op, name>;
-}
-
multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> {
let AssemblerPredicate = isGFX12Not12_50 in {
defm "" : VFLAT_Real_gfx12<op>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 67c6daa..268162b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -234,6 +234,7 @@ protected:
bool HasRestrictedSOffset = false;
bool Has64BitLiterals = false;
bool HasBitOp3Insts = false;
+ bool HasTanhInsts = false;
bool HasTransposeLoadF4F6Insts = false;
bool HasPrngInst = false;
bool HasBVHDualAndBVH8Insts = false;
@@ -1380,6 +1381,8 @@ public:
return HasMinimum3Maximum3F16;
}
+ bool hasTanhInsts() const { return HasTanhInsts; }
+
bool hasAddPC64Inst() const { return GFX1250Insts; }
bool hasMinimum3Maximum3PKF16() const {
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 9b5a463..44d9ef5 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -378,6 +378,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
default:
return false;
case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
SMovOp = AMDGPU::S_MOV_B32;
break;
case AMDGPU::V_MOV_B64_PSEUDO:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dfe6f65..27212fda 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9308,7 +9308,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_reloc_constant: {
- Module *M = const_cast<Module *>(MF.getFunction().getParent());
+ Module *M = MF.getFunction().getParent();
const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
auto *RelocSymbol = cast<GlobalVariable>(
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index c91319e..8c35fea 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>;
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
+
+let SubtargetPredicate = HasTanhInsts in
+defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
@@ -535,6 +538,7 @@ defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>;
defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>;
defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>;
+defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>;
}
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
@@ -1137,6 +1141,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;
defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;
+defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
@@ -1149,6 +1154,7 @@ defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>;
defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>;
defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>;
defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>;
+defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>;
//===----------------------------------------------------------------------===//
// GFX10.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 65d1c4e..fd3b052 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3545,8 +3545,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
auto T = const_cast<Type*>(CP->getType());
auto C = const_cast<Constant*>(CP->getConstVal());
- auto M = const_cast<Module*>(DAG.getMachineFunction().
- getFunction().getParent());
+ auto M = DAG.getMachineFunction().getFunction().getParent();
auto GV = new GlobalVariable(
*M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
@@ -21585,7 +21584,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -21593,6 +21592,11 @@ bool ARMTargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load");
+
auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
Type *EltTy = VecTy->getElementType();
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 5f4aef5..9159f3d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -681,7 +681,7 @@ class VectorType;
unsigned getMaxSupportedInterleaveFactor() const override;
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
index ce43645..f0e2e78 100644
--- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
+++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -343,6 +343,16 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Info.RootFlattenedArrayType, Info.RootPointerOperand,
{ZeroIndex, FlattenedIndex}, GEP.getName(), GEP.getNoWrapFlags());
+ // If the pointer operand is a global variable and all indices are 0,
+ // IRBuilder::CreateGEP will return the global variable instead of creating
+ // a GEP instruction or GEP ConstantExpr. In this case we have to create and
+ // insert our own GEP instruction.
+ if (!isa<GEPOperator>(NewGEP))
+ NewGEP = GetElementPtrInst::Create(
+ Info.RootFlattenedArrayType, Info.RootPointerOperand,
+ {ZeroIndex, FlattenedIndex}, GEP.getNoWrapFlags(), GEP.getName(),
+ Builder.GetInsertPoint());
+
// Replace the current GEP with the new GEP. Store GEPInfo into the map
// for later use in case this GEP was not the end of the chain
GEPChainInfoMap.insert({cast<GEPOperator>(NewGEP), std::move(Info)});
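A standalone illustration of the folding behavior the new guard works around: with a global pointer operand and all-zero constant indices, IRBuilder's default constant folder hands back the global itself rather than a GEP, so there is no GEPOperator to record in GEPChainInfoMap.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Operator.h"
    #include <cassert>
    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("m", Ctx);
      auto *ArrTy = ArrayType::get(Type::getInt32Ty(Ctx), 8);
      auto *GV = new GlobalVariable(M, ArrTy, /*isConstant=*/false,
                                    GlobalValue::ExternalLinkage,
                                    /*Initializer=*/nullptr, "g");
      IRBuilder<> Builder(Ctx);
      Value *Zero = Builder.getInt32(0);
      Value *V = Builder.CreateGEP(ArrTy, GV, {Zero, Zero});
      // The zero-index GEP constant-folds away to the pointer operand.
      assert(V == GV && !isa<GEPOperator>(V));
    }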
diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index c9ff713..c73648f 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -563,7 +563,7 @@ legalizeGetHighLowi64Bytes(Instruction &I,
}
static void
-legalizeLoadStoreOnArrayAllocas(Instruction &I,
+legalizeScalarLoadStoreOnArrays(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &) {
@@ -581,23 +581,31 @@ legalizeLoadStoreOnArrayAllocas(Instruction &I,
} else
return;
- assert(LoadStoreTy->isSingleValueType() &&
- "Expected load/store type to be a single-valued type");
+ // If the load/store is not of a single-value type (i.e., scalar or vector)
+ // then we do not modify it. It shouldn't be a vector either because the
+ // dxil-data-scalarization pass is expected to run before this, but it's not
+ // incorrect to apply this transformation to vector load/stores.
+ if (!LoadStoreTy->isSingleValueType())
+ return;
- auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp);
- if (!AllocaPtrOp)
+ Type *ArrayTy;
+ if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp))
+ ArrayTy = GlobalVarPtrOp->getValueType();
+ else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp))
+ ArrayTy = AllocaPtrOp->getAllocatedType();
+ else
return;
- Type *Ty = AllocaPtrOp->getAllocatedType();
- if (!isa<ArrayType>(Ty))
+ if (!isa<ArrayType>(ArrayTy))
return;
- assert(!isa<ArrayType>(Ty->getArrayElementType()) &&
- "Expected allocated type of AllocaInst to be a flat ArrayType");
- IRBuilder<> Builder(&I);
- Value *Zero = Builder.getInt32(0);
- Value *GEP = Builder.CreateGEP(Ty, AllocaPtrOp, {Zero, Zero}, "",
- GEPNoWrapFlags::all());
+ assert(ArrayTy->getArrayElementType() == LoadStoreTy &&
+ "Expected array element type to be the same as to the scalar load or "
+ "store type");
+
+ Value *Zero = ConstantInt::get(Type::getInt32Ty(I.getContext()), 0);
+ Value *GEP = GetElementPtrInst::Create(
+ ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator());
I.setOperand(PtrOpIndex, GEP);
}
@@ -651,7 +659,7 @@ private:
// downcastI64toI32InsertExtractElements needs to handle.
LegalizationPipeline[Stage2].push_back(
downcastI64toI32InsertExtractElements);
- LegalizationPipeline[Stage2].push_back(legalizeLoadStoreOnArrayAllocas);
+ LegalizationPipeline[Stage2].push_back(legalizeScalarLoadStoreOnArrays);
}
};
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index a5d735c..e0a8c07 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -429,7 +429,7 @@ public:
bool fallBackToDAGISel(const Instruction &Inst) const override;
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
@@ -444,9 +444,6 @@ public:
Instruction *Store, Value *Mask,
ArrayRef<Value *> InterleaveValues) const override;
- bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveRes) const override;
-
bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOps) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 0d4f241..38cc0ce 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -115,21 +115,49 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool RISCVTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Indices.size() == Shuffles.size());
- IRBuilder<> Builder(LI);
-
- const DataLayout &DL = LI->getDataLayout();
+ IRBuilder<> Builder(Load);
+ const DataLayout &DL = Load->getDataLayout();
auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
- if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(),
- LI->getPointerAddressSpace(), DL))
- return false;
+ auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+
+ Value *Ptr, *VL;
+ Align Alignment;
+ if (auto *LI = dyn_cast<LoadInst>(Load)) {
+ assert(LI->isSimple());
+ Ptr = LI->getPointerOperand();
+ Alignment = LI->getAlign();
+    assert(!Mask && "Unexpected mask on a load");
+ Mask = Builder.getAllOnesMask(VTy->getElementCount());
+ VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
+ } else {
+ auto *VPLoad = cast<VPIntrinsic>(Load);
+ assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load &&
+ "Unexpected intrinsic");
+ Ptr = VPLoad->getMemoryPointerParam();
+ Alignment = VPLoad->getPointerAlignment().value_or(
+ DL.getABITypeAlign(VTy->getElementType()));
- auto *PtrTy = LI->getPointerOperandType();
- auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+ assert(Mask && "vp.load needs a mask!");
+
+ Value *WideEVL = VPLoad->getVectorLengthParam();
+ // Conservatively check if EVL is a multiple of factor, otherwise some
+ // (trailing) elements might be lost after the transformation.
+ if (!isMultipleOfN(WideEVL, DL, Factor))
+ return false;
+
+ auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+ VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+ }
+
+ Type *PtrTy = Ptr->getType();
+ unsigned AS = PtrTy->getPointerAddressSpace();
+ if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+ return false;
// If the segment load is going to be performed segment at a time anyways
// and there's only one element used, use a strided load instead. This
@@ -138,26 +166,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
- Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
- Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
- Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
- VTy->getElementCount());
-
+ Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+ // Note: Same VL as above, but i32 not xlen due to signature of
+ // vp.strided.load
+ VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+ VTy->getElementCount());
CallInst *CI =
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
{VTy, BasePtr->getType(), Stride->getType()},
{BasePtr, Stride, Mask, VL});
- CI->addParamAttr(
- 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
+ CI->addParamAttr(0,
+ Attribute::getWithAlignment(CI->getContext(), Alignment));
Shuffles[0]->replaceAllUsesWith(CI);
return true;
};
- Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
- Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
CallInst *VlsegN = Builder.CreateIntrinsic(
- FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy},
- {LI->getPointerOperand(), Mask, VL});
+ FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
for (unsigned i = 0; i < Shuffles.size(); i++) {
Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
@@ -426,122 +451,6 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
return true;
}
-/// Lower an interleaved vp.load into a vlsegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.load (Factor = 2):
-/// %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr,
-/// %mask,
-/// i32 %wide.rvl)
-/// %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-/// @llvm.vector.deinterleave2.nxv64i8(
-/// <vscale x 64 x i8> %l)
-/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0
-/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1
-///
-/// Into:
-/// %rvl = udiv %wide.rvl, 2
-/// %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-/// @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef,
-/// <vscale x 32 x i8> undef,
-/// ptr %ptr,
-/// %mask,
-/// i64 %rvl,
-/// i64 1)
-/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0
-/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1
-///
-/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be
-/// removed by the caller
-/// TODO: We probably can loosen the dependency on matching extractvalue when
-/// dealing with factor of 2 (extractvalue is still required for most of other
-/// factors though).
-bool RISCVTargetLowering::lowerInterleavedVPLoad(
- VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveResults) const {
- const unsigned Factor = DeinterleaveResults.size();
- assert(Mask && "Expect a valid mask");
- assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
- "Unexpected intrinsic");
-
- Value *FirstActive = *llvm::find_if(DeinterleaveResults,
- [](Value *V) { return V != nullptr; });
- VectorType *VTy = cast<VectorType>(FirstActive->getType());
-
- auto &DL = Load->getModule()->getDataLayout();
- Align Alignment = Load->getParamAlign(0).value_or(
- DL.getABITypeAlign(VTy->getElementType()));
- if (!isLegalInterleavedAccessType(
- VTy, Factor, Alignment,
- Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
- return false;
-
- IRBuilder<> Builder(Load);
-
- Value *WideEVL = Load->getVectorLengthParam();
- // Conservatively check if EVL is a multiple of factor, otherwise some
- // (trailing) elements might be lost after the transformation.
- if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
- return false;
-
- auto *PtrTy = Load->getArgOperand(0)->getType();
- auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
- auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
- Value *EVL =
- Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-
- Value *Return = nullptr;
- if (isa<FixedVectorType>(VTy)) {
- Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
- {VTy, PtrTy, XLenTy},
- {Load->getArgOperand(0), Mask, EVL});
- } else {
- unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
- unsigned NumElts = VTy->getElementCount().getKnownMinValue();
- Type *VecTupTy = TargetExtType::get(
- Load->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
- NumElts * SEW / 8),
- Factor);
-
- Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
- Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
- {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
- Value *Operands[] = {
- PoisonValue::get(VecTupTy),
- Load->getArgOperand(0),
- Mask,
- EVL,
- ConstantInt::get(XLenTy,
- RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
- ConstantInt::get(XLenTy, Log2_64(SEW))};
-
- CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
-
- SmallVector<Type *, 8> AggrTypes{Factor, VTy};
- Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
- Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
- Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
- for (unsigned i = 0; i < Factor; ++i) {
- Value *VecExtract =
- Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
- Return = Builder.CreateInsertValue(Return, VecExtract, i);
- }
- }
-
- for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
- if (!DIO)
- continue;
- // We have to create a brand new ExtractValue to replace each
- // of these old ExtractValue instructions.
- Value *NewEV =
- Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
- DIO->replaceAllUsesWith(NewEV);
- }
-
- return true;
-}
-
/// Lower an interleaved vp.store into a vssegN intrinsic.
///
/// E.g. Lower an interleaved vp.store (Factor = 2):
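The EVL handling in the merged lowerInterleavedLoad above reduces to simple arithmetic: the wide vp.load covers Factor * N lanes under a single EVL, so the per-segment VL must be EVL / Factor, and the transform bails out unless EVL is provably a multiple of Factor (otherwise trailing elements would be silently dropped). In sketch form:

    #include <cassert>

    unsigned segmentVL(unsigned WideEVL, unsigned Factor) {
      // Mirrors the CreateExactUDiv above: only valid once the check passed.
      assert(WideEVL % Factor == 0 && "trailing elements would be lost");
      return WideEVL / Factor;
    }

    int main() {
      assert(segmentVL(8, 2) == 4); // e.g. a factor-2 vp.load of 8 lanes
    }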
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 6897865..ea78dcd 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -1364,7 +1364,24 @@ defm : DemangledGetBuiltin<"get_sub_group_gt_mask", OpenCL_std, Variable, Subgro
defm : DemangledGetBuiltin<"get_sub_group_le_mask", OpenCL_std, Variable, SubgroupLeMask>;
defm : DemangledGetBuiltin<"get_sub_group_lt_mask", OpenCL_std, Variable, SubgroupLtMask>;
defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalLinearId", OpenCL_std, Variable, GlobalLinearId>;
-defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, Variable, GlobalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationIndex", OpenCL_std, Variable, LocalInvocationIndex>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkDim", OpenCL_std, Variable, WorkDim>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupSize", OpenCL_std, Variable, SubgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupMaxSize", OpenCL_std, Variable, SubgroupMaxSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumSubgroups", OpenCL_std, Variable, NumSubgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumEnqueuedSubgroups", OpenCL_std, Variable, NumEnqueuedSubgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupId", OpenCL_std, Variable, SubgroupId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLocalInvocationId", OpenCL_std, Variable, SubgroupLocalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMask", OpenCL_std, Variable, SubgroupEqMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMaskKHR", OpenCL_std, Variable, SubgroupEqMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMask", OpenCL_std, Variable, SubgroupGeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMaskKHR", OpenCL_std, Variable, SubgroupGeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMask", OpenCL_std, Variable, SubgroupGtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMaskKHR", OpenCL_std, Variable, SubgroupGtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMask", OpenCL_std, Variable, SubgroupLeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMaskKHR", OpenCL_std, Variable, SubgroupLeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMask", OpenCL_std, Variable, SubgroupLtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMaskKHR", OpenCL_std, Variable, SubgroupLtMask>;
// GetQuery builtin records:
defm : DemangledGetBuiltin<"get_local_id", OpenCL_std, GetQuery, LocalInvocationId>;
@@ -1375,6 +1392,14 @@ defm : DemangledGetBuiltin<"get_group_id", OpenCL_std, GetQuery, WorkgroupId>;
defm : DemangledGetBuiltin<"get_enqueued_local_size", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>;
defm : DemangledGetBuiltin<"get_num_groups", OpenCL_std, GetQuery, NumWorkgroups>;
defm : DemangledGetBuiltin<"get_global_offset", OpenCL_std, GetQuery, GlobalOffset>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationId", OpenCL_std, GetQuery, LocalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, GetQuery, GlobalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupSize", OpenCL_std, GetQuery, WorkgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalSize", OpenCL_std, GetQuery, GlobalSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupId", OpenCL_std, GetQuery, WorkgroupId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInEnqueuedWorkgroupSize", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumWorkgroups", OpenCL_std, GetQuery, NumWorkgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalOffset", OpenCL_std, GetQuery, GlobalOffset>;
defm : DemangledGetBuiltin<"__hlsl_wave_get_lane_index", GLSL_std_450, Wave, SubgroupLocalInvocationId>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 6bcb7a3..2636979 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1661,7 +1661,7 @@ namespace llvm {
/// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 1eb47e3..360293bc 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// number of shuffles and ISA.
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load");
+
// Create an interleaved access group.
IRBuilder<> Builder(LI);
X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index d7e206ef..4ca7444 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -443,6 +443,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["gfx1250-insts"] = true;
Features["bitop3-insts"] = true;
Features["prng-inst"] = true;
+ Features["tanh-insts"] = true;
Features["transpose-load-f4f6-insts"] = true;
Features["bf16-trans-insts"] = true;
Features["fp8-conversion-insts"] = true;
diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
index e7a6fa4..55f3239 100644
--- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
+++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
@@ -10,6 +10,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
@@ -184,22 +185,14 @@ void LowerAllowCheckPass::printPipeline(
// correctness.
// TODO: print shorter output by combining adjacent runs, etc.
int i = 0;
- bool printed = false;
+ ListSeparator LS(";");
for (unsigned int cutoff : Opts.cutoffs) {
- if (cutoff > 0) {
- if (printed)
- OS << ";";
- OS << "cutoffs[" << i << "]=" << cutoff;
- printed = true;
- }
-
+ if (cutoff > 0)
+ OS << LS << "cutoffs[" << i << "]=" << cutoff;
i++;
}
- if (Opts.runtime_check) {
- if (printed)
- OS << ";";
- OS << "runtime_check=" << Opts.runtime_check;
- }
+ if (Opts.runtime_check)
+ OS << LS << "runtime_check=" << Opts.runtime_check;
OS << '>';
}
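With both emitters sharing one ListSeparator(";"), the printed pipeline entry (assuming the pass's registered name, "lower-allow-check") comes out as, for example:

    lower-allow-check<cutoffs[1]=990;cutoffs[3]=500;runtime_check=1>

Zero-valued cutoffs are skipped while the index i still advances, which is why the printed indices can be sparse.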