author | U-BERGUFFLEN\meinersbur <llvm-project@meinersbur.de> | 2024-11-04 14:08:55 +0100
committer | U-BERGUFFLEN\meinersbur <llvm-project@meinersbur.de> | 2024-11-04 14:08:55 +0100
commit | 54bc5758988e505e17618c04a9a36d1cfa72913d (patch)
tree | 37e93309d367876bc1b6ffac21988665f85dc43d
parent | dacf10168ae0dfd222ed301b93369834d411d139 (diff)
parent | 3cffa3474fd20518e19afa0c0ad3ff602864f688 (diff)
Merge commit '3cffa3474fd20518e19afa0c0ad3ff602864f688' into users/meinersbur/irbuilder-extract
20 files changed, 5299 insertions, 1673 deletions
diff --git a/clang/docs/APINotes.rst b/clang/docs/APINotes.rst index dcefa68..d20c4f9 100644 --- a/clang/docs/APINotes.rst +++ b/clang/docs/APINotes.rst @@ -172,10 +172,28 @@ declaration kind), all of which are optional: :: Tags: + - Name: OwnedStorage + SwiftImportAs: owned + +:SwiftRetainOp, SwiftReleaseOp: + + Controls the lifetime operations of a class which uses custom reference + counting. The class must be annotated as a reference type using + ``SwiftImportAs: reference``. The values are either names of global functions, + each taking a single parameter of a pointer type, or ``immortal`` for a type + that is considered alive for the duration of the program. + + :: + + Tags: - Name: RefCountedStorage SwiftImportAs: reference SwiftReleaseOp: RCRelease SwiftRetainOp: RCRetain + - Name: ImmortalSingleton + SwiftImportAs: reference + SwiftReleaseOp: immortal + SwiftRetainOp: immortal :SwiftCopyable: diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 229721a..6bf2908 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -43,6 +43,7 @@ #include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" @@ -4774,7 +4775,10 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const { ->getTemplateName() .getAsTemplateDecl()) if (auto *CTD = dyn_cast<ClassTemplateDecl>(templateDecl)) - return CTD->getTemplatedDecl()->hasAttr<TypeNullableAttr>(); + return llvm::any_of( + CTD->redecls(), [](const RedeclarableTemplateDecl *RTD) { + return RTD->getTemplatedDecl()->hasAttr<TypeNullableAttr>(); + }); return ResultIfUnknown; case Type::Builtin: @@ -4841,10 +4845,14 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const { // For template specializations, look only at primary template attributes. // This is a consistent regardless of whether the instantiation is known. if (const auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(RD)) - return CTSD->getSpecializedTemplate() - ->getTemplatedDecl() - ->hasAttr<TypeNullableAttr>(); - return RD->hasAttr<TypeNullableAttr>(); + return llvm::any_of( + CTSD->getSpecializedTemplate()->redecls(), + [](const RedeclarableTemplateDecl *RTD) { + return RTD->getTemplatedDecl()->hasAttr<TypeNullableAttr>(); + }); + return llvm::any_of(RD->redecls(), [](const TagDecl *RD) { + return RD->hasAttr<TypeNullableAttr>(); + }); } // Non-pointer types. 
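Note on the Type.cpp hunk above: canHaveNullability() now consults every redeclaration of a class or class template rather than only the primary declaration, so a `_Nullable` attached to a later redeclaration is still honored. A small illustration of the pattern this enables (hypothetical `SmartPtr` name; the new SemaCXX test below exercises the same shape):

```cpp
// Illustration only (hypothetical SmartPtr): _Nullable appears on a
// redeclaration, and a later _Nonnull use of the type must still see it.
template <class T> class SmartPtr;             // primary declaration, no attribute
template <class T> class _Nullable SmartPtr;   // redeclaration carries _Nullable
using NonNullPtr = SmartPtr<int> _Nonnull;     // accepted once any redecl is _Nullable
```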
diff --git a/clang/test/SemaCXX/nullability_redecl.cpp b/clang/test/SemaCXX/nullability_redecl.cpp new file mode 100644 index 0000000..99bc521 --- /dev/null +++ b/clang/test/SemaCXX/nullability_redecl.cpp @@ -0,0 +1,27 @@ +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -Wno-nullability-declspec %s -verify -Wnullable-to-nonnull-conversion -I%S/Inputs + +class Foo; +using Foo1 = Foo _Nonnull; // expected-error{{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'Foo'}} +class _Nullable Foo; +using Foo2 = Foo _Nonnull; +class Foo; +using Foo3 = Foo _Nonnull; + +template <class T> +class Bar; +using Bar1 = Bar<int> _Nonnull; // expected-error{{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'Bar<int>'}} +template <class T> +class _Nullable Bar; +using Bar2 = Bar<int> _Nonnull; +template <class T> +class Bar; +using Bar3 = Bar<int> _Nonnull; + +namespace std { + template<class T> class unique_ptr; + using UP1 = unique_ptr<int> _Nonnull; + class X { template<class T> friend class unique_ptr; }; + using UP2 = unique_ptr<int> _Nonnull; + template<class T> class unique_ptr; + using UP3 = unique_ptr<int> _Nonnull; +} diff --git a/libclc/generic/lib/math/clc_fma.cl b/libclc/generic/lib/math/clc_fma.cl index dee90e9..34355a3 100644 --- a/libclc/generic/lib/math/clc_fma.cl +++ b/libclc/generic/lib/math/clc_fma.cl @@ -22,137 +22,141 @@ #include <clc/clc.h> +#include "../clcmacro.h" #include "config.h" #include "math.h" -#include "../clcmacro.h" struct fp { - ulong mantissa; - int exponent; - uint sign; + ulong mantissa; + int exponent; + uint sign; }; -_CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) -{ - /* special cases */ - if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b)) - return mad(a, b, c); +_CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) { + /* special cases */ + if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b)) + return mad(a, b, c); - /* If only c is inf, and both a,b are regular numbers, the result is c*/ - if (isinf(c)) - return c; + /* If only c is inf, and both a,b are regular numbers, the result is c*/ + if (isinf(c)) + return c; - a = __clc_flush_denormal_if_not_supported(a); - b = __clc_flush_denormal_if_not_supported(b); - c = __clc_flush_denormal_if_not_supported(c); + a = __clc_flush_denormal_if_not_supported(a); + b = __clc_flush_denormal_if_not_supported(b); + c = __clc_flush_denormal_if_not_supported(c); - if (c == 0) - return a * b; + if (c == 0) + return a * b; - struct fp st_a, st_b, st_c; + struct fp st_a, st_b, st_c; - st_a.exponent = a == .0f ? 0 : ((as_uint(a) & 0x7f800000) >> 23) - 127; - st_b.exponent = b == .0f ? 0 : ((as_uint(b) & 0x7f800000) >> 23) - 127; - st_c.exponent = c == .0f ? 0 : ((as_uint(c) & 0x7f800000) >> 23) - 127; + st_a.exponent = a == .0f ? 0 : ((as_uint(a) & 0x7f800000) >> 23) - 127; + st_b.exponent = b == .0f ? 0 : ((as_uint(b) & 0x7f800000) >> 23) - 127; + st_c.exponent = c == .0f ? 0 : ((as_uint(c) & 0x7f800000) >> 23) - 127; - st_a.mantissa = a == .0f ? 0 : (as_uint(a) & 0x7fffff) | 0x800000; - st_b.mantissa = b == .0f ? 0 : (as_uint(b) & 0x7fffff) | 0x800000; - st_c.mantissa = c == .0f ? 0 : (as_uint(c) & 0x7fffff) | 0x800000; + st_a.mantissa = a == .0f ? 0 : (as_uint(a) & 0x7fffff) | 0x800000; + st_b.mantissa = b == .0f ? 0 : (as_uint(b) & 0x7fffff) | 0x800000; + st_c.mantissa = c == .0f ? 
0 : (as_uint(c) & 0x7fffff) | 0x800000; - st_a.sign = as_uint(a) & 0x80000000; - st_b.sign = as_uint(b) & 0x80000000; - st_c.sign = as_uint(c) & 0x80000000; + st_a.sign = as_uint(a) & 0x80000000; + st_b.sign = as_uint(b) & 0x80000000; + st_c.sign = as_uint(c) & 0x80000000; - // Multiplication. - // Move the product to the highest bits to maximize precision - // mantissa is 24 bits => product is 48 bits, 2bits non-fraction. - // Add one bit for future addition overflow, - // add another bit to detect subtraction underflow - struct fp st_mul; - st_mul.sign = st_a.sign ^ st_b.sign; - st_mul.mantissa = (st_a.mantissa * st_b.mantissa) << 14ul; - st_mul.exponent = st_mul.mantissa ? st_a.exponent + st_b.exponent : 0; + // Multiplication. + // Move the product to the highest bits to maximize precision + // mantissa is 24 bits => product is 48 bits, 2bits non-fraction. + // Add one bit for future addition overflow, + // add another bit to detect subtraction underflow + struct fp st_mul; + st_mul.sign = st_a.sign ^ st_b.sign; + st_mul.mantissa = (st_a.mantissa * st_b.mantissa) << 14ul; + st_mul.exponent = st_mul.mantissa ? st_a.exponent + st_b.exponent : 0; - // FIXME: Detecting a == 0 || b == 0 above crashed GCN isel - if (st_mul.exponent == 0 && st_mul.mantissa == 0) - return c; + // FIXME: Detecting a == 0 || b == 0 above crashed GCN isel + if (st_mul.exponent == 0 && st_mul.mantissa == 0) + return c; // Mantissa is 23 fractional bits, shift it the same way as product mantissa #define C_ADJUST 37ul - // both exponents are bias adjusted - int exp_diff = st_mul.exponent - st_c.exponent; - - st_c.mantissa <<= C_ADJUST; - ulong cutoff_bits = 0; - ulong cutoff_mask = (1ul << abs(exp_diff)) - 1ul; - if (exp_diff > 0) { - cutoff_bits = exp_diff >= 64 ? st_c.mantissa : (st_c.mantissa & cutoff_mask); - st_c.mantissa = exp_diff >= 64 ? 0 : (st_c.mantissa >> exp_diff); - } else { - cutoff_bits = -exp_diff >= 64 ? st_mul.mantissa : (st_mul.mantissa & cutoff_mask); - st_mul.mantissa = -exp_diff >= 64 ? 0 : (st_mul.mantissa >> -exp_diff); - } - - struct fp st_fma; - st_fma.sign = st_mul.sign; - st_fma.exponent = max(st_mul.exponent, st_c.exponent); - if (st_c.sign == st_mul.sign) { - st_fma.mantissa = st_mul.mantissa + st_c.mantissa; - } else { - // cutoff bits borrow one - st_fma.mantissa = st_mul.mantissa - st_c.mantissa - (cutoff_bits && (st_mul.exponent > st_c.exponent) ? 
1 : 0); - } - - // underflow: st_c.sign != st_mul.sign, and magnitude switches the sign - if (st_fma.mantissa > LONG_MAX) { - st_fma.mantissa = 0 - st_fma.mantissa; - st_fma.sign = st_mul.sign ^ 0x80000000; - } - - // detect overflow/underflow - int overflow_bits = 3 - clz(st_fma.mantissa); - - // adjust exponent - st_fma.exponent += overflow_bits; - - // handle underflow - if (overflow_bits < 0) { - st_fma.mantissa <<= -overflow_bits; - overflow_bits = 0; - } - - // rounding - ulong trunc_mask = (1ul << (C_ADJUST + overflow_bits)) - 1; - ulong trunc_bits = (st_fma.mantissa & trunc_mask) | (cutoff_bits != 0); - ulong last_bit = st_fma.mantissa & (1ul << (C_ADJUST + overflow_bits)); - ulong grs_bits = (0x4ul << (C_ADJUST - 3 + overflow_bits)); - - // round to nearest even - if ((trunc_bits > grs_bits) || - (trunc_bits == grs_bits && last_bit != 0)) - st_fma.mantissa += (1ul << (C_ADJUST + overflow_bits)); - - // Shift mantissa back to bit 23 - st_fma.mantissa = (st_fma.mantissa >> (C_ADJUST + overflow_bits)); - - // Detect rounding overflow - if (st_fma.mantissa > 0xffffff) { - ++st_fma.exponent; - st_fma.mantissa >>= 1; - } - - if (st_fma.mantissa == 0) - return .0f; - - // Flating point range limit - if (st_fma.exponent > 127) - return as_float(as_uint(INFINITY) | st_fma.sign); - - // Flush denormals - if (st_fma.exponent <= -127) - return as_float(st_fma.sign); - - return as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) | ((uint)st_fma.mantissa & 0x7fffff)); + // both exponents are bias adjusted + int exp_diff = st_mul.exponent - st_c.exponent; + + st_c.mantissa <<= C_ADJUST; + ulong cutoff_bits = 0; + ulong cutoff_mask = (1ul << abs(exp_diff)) - 1ul; + if (exp_diff > 0) { + cutoff_bits = + exp_diff >= 64 ? st_c.mantissa : (st_c.mantissa & cutoff_mask); + st_c.mantissa = exp_diff >= 64 ? 0 : (st_c.mantissa >> exp_diff); + } else { + cutoff_bits = + -exp_diff >= 64 ? st_mul.mantissa : (st_mul.mantissa & cutoff_mask); + st_mul.mantissa = -exp_diff >= 64 ? 0 : (st_mul.mantissa >> -exp_diff); + } + + struct fp st_fma; + st_fma.sign = st_mul.sign; + st_fma.exponent = max(st_mul.exponent, st_c.exponent); + if (st_c.sign == st_mul.sign) { + st_fma.mantissa = st_mul.mantissa + st_c.mantissa; + } else { + // cutoff bits borrow one + st_fma.mantissa = + st_mul.mantissa - st_c.mantissa - + (cutoff_bits && (st_mul.exponent > st_c.exponent) ? 
1 : 0); + } + + // underflow: st_c.sign != st_mul.sign, and magnitude switches the sign + if (st_fma.mantissa > LONG_MAX) { + st_fma.mantissa = 0 - st_fma.mantissa; + st_fma.sign = st_mul.sign ^ 0x80000000; + } + + // detect overflow/underflow + int overflow_bits = 3 - clz(st_fma.mantissa); + + // adjust exponent + st_fma.exponent += overflow_bits; + + // handle underflow + if (overflow_bits < 0) { + st_fma.mantissa <<= -overflow_bits; + overflow_bits = 0; + } + + // rounding + ulong trunc_mask = (1ul << (C_ADJUST + overflow_bits)) - 1; + ulong trunc_bits = (st_fma.mantissa & trunc_mask) | (cutoff_bits != 0); + ulong last_bit = st_fma.mantissa & (1ul << (C_ADJUST + overflow_bits)); + ulong grs_bits = (0x4ul << (C_ADJUST - 3 + overflow_bits)); + + // round to nearest even + if ((trunc_bits > grs_bits) || (trunc_bits == grs_bits && last_bit != 0)) + st_fma.mantissa += (1ul << (C_ADJUST + overflow_bits)); + + // Shift mantissa back to bit 23 + st_fma.mantissa = (st_fma.mantissa >> (C_ADJUST + overflow_bits)); + + // Detect rounding overflow + if (st_fma.mantissa > 0xffffff) { + ++st_fma.exponent; + st_fma.mantissa >>= 1; + } + + if (st_fma.mantissa == 0) + return .0f; + + // Flating point range limit + if (st_fma.exponent > 127) + return as_float(as_uint(INFINITY) | st_fma.sign); + + // Flush denormals + if (st_fma.exponent <= -127) + return as_float(st_fma.sign); + + return as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) | + ((uint)st_fma.mantissa & 0x7fffff)); } -_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_sw_fma, float, float, float) +_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_sw_fma, float, + float, float) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py index 7167f4c..3fefe87 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py @@ -47,6 +47,12 @@ for r in range(2): name = f"test_r{r}" defines = [f"REVISION={r}"] + # LLDB's FormatterCache caches on DW_AT_name. A change introduced in + # clang-17 (commit bee886052) changed the contents of DW_AT_name for + # template specializations, which broke FormatterCache assumptions + # causing this test to fail. This was reverted in newer version of clang + # with commit 52a9ba7ca. + @skipIf(compiler="clang", compiler_version=["=", "17"]) @functools.wraps(LibcxxOptionalDataFormatterSimulatorTestCase._run_test) def test_method(self, defines=defines): LibcxxOptionalDataFormatterSimulatorTestCase._run_test(self, defines) diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h index e82155a..ff5af59 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h @@ -155,7 +155,7 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> { Function *F; const DataLayout &DL; TargetTransformInfo &TTI; - SCCPSolver &Solver; + const SCCPSolver &Solver; ConstMap KnownConstants; // Basic blocks known to be unreachable after constant propagation. 
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 526305d..892912a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -990,13 +990,13 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg, Register VarReg = MRI->createGenericVirtualRegister(LLT::scalar(64)); GR.add(GV, GR.CurMF, VarReg); - buildOpDecorate(VarReg, I, TII, SPIRV::Decoration::Constant, {}); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVariable)) .addDef(VarReg) .addUse(GR.getSPIRVTypeID(VarTy)) .addImm(SPIRV::StorageClass::UniformConstant) .addUse(Const) .constrainAllUses(TII, TRI, RBI); + buildOpDecorate(VarReg, I, TII, SPIRV::Decoration::Constant, {}); SPIRVType *SourceTy = GR.getOrCreateSPIRVPointerType( ValTy, I, TII, SPIRV::StorageClass::UniformConstant); SrcReg = MRI->createGenericVirtualRegister(LLT::scalar(64)); diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index cc34cf8..790d86f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -829,7 +829,7 @@ static void insertSpirvDecorations(MachineFunction &MF, MachineIRBuilder MIB) { for (MachineInstr &MI : MBB) { if (!isSpvIntrinsic(MI, Intrinsic::spv_assign_decoration)) continue; - MIB.setInsertPt(*MI.getParent(), MI); + MIB.setInsertPt(*MI.getParent(), MI.getNextNode()); buildOpSpirvDecorations(MI.getOperand(1).getReg(), MIB, MI.getOperand(2).getMetadata()); ToErase.push_back(&MI); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0ae814d..1fd6b24 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -18831,44 +18831,66 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } -static SDValue -GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, - SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg, - unsigned char OperandFlags, bool LocalDynamic = false) { +static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, + const EVT PtrVT, unsigned ReturnReg, + unsigned char OperandFlags, + bool LoadGlobalBaseReg = false, + bool LocalDynamic = false) { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDLoc dl(GA); SDValue TGA; bool UseTLSDESC = DAG.getTarget().useTLSDESC(); + SDValue Chain = DAG.getEntryNode(); + SDValue Ret; if (LocalDynamic && UseTLSDESC) { TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags); auto UI = TGA->use_begin(); // Reuse existing GetTLSADDR node if we can find it. - if (UI != TGA->use_end()) - return SDValue(*UI->use_begin()->use_begin(), 0); + if (UI != TGA->use_end()) { + // TLSDESC uses TGA. + auto TLSDescOp = UI; + assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC && + "Unexpected TLSDESC DAG"); + // CALLSEQ_END uses TGA via a chain and glue. + auto *CallSeqEndOp = TLSDescOp->getGluedUser(); + assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END && + "Unexpected TLSDESC DAG"); + // CopyFromReg uses CALLSEQ_END via a chain and glue. 
+ auto *CopyFromRegOp = CallSeqEndOp->getGluedUser(); + assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg && + "Unexpected TLSDESC DAG"); + Ret = SDValue(CopyFromRegOp, 0); + } } else { TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); } - X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC - : LocalDynamic ? X86ISD::TLSBASEADDR - : X86ISD::TLSADDR; + if (!Ret) { + X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC + : LocalDynamic ? X86ISD::TLSBASEADDR + : X86ISD::TLSADDR; - if (InGlue) { - SDValue Ops[] = { Chain, TGA, *InGlue }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops); - } else { - SDValue Ops[] = { Chain, TGA }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops); - } + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); + if (LoadGlobalBaseReg) { + SDValue InGlue; + Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), + InGlue); + InGlue = Chain.getValue(1); + Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue}); + } else { + Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA}); + } + Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl); - // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. - MFI.setAdjustsStack(true); - MFI.setHasCalls(true); + // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. + MFI.setHasCalls(true); - SDValue Glue = Chain.getValue(1); - SDValue Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue); + SDValue Glue = Chain.getValue(1); + Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue); + } if (!UseTLSDESC) return Ret; @@ -18887,30 +18909,22 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { - SDValue InGlue; - SDLoc dl(GA); // ? function entry point might be better - SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, - DAG.getNode(X86ISD::GlobalBaseReg, - SDLoc(), PtrVT), InGlue); - InGlue = Chain.getValue(1); - - return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD); + return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD, + /*LoadGlobalBaseReg=*/true); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64 static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { - return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, - X86::RAX, X86II::MO_TLSGD); + return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32 static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { - return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, - X86::EAX, X86II::MO_TLSGD); + return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD); } static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, @@ -18919,22 +18933,20 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SDLoc dl(GA); // Get the start address of the TLS block for this module. - X86MachineFunctionInfo *MFI = DAG.getMachineFunction() - .getInfo<X86MachineFunctionInfo>(); + X86MachineFunctionInfo *MFI = + DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>(); MFI->incNumLocalDynamicTLSAccesses(); SDValue Base; if (Is64Bit) { unsigned ReturnReg = Is64BitLP64 ? 
X86::RAX : X86::EAX; - Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg, - X86II::MO_TLSLD, /*LocalDynamic=*/true); + Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD, + /*LoadGlobalBaseReg=*/false, + /*LocalDynamic=*/true); } else { - SDValue InGlue; - SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, - DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue); - InGlue = Chain.getValue(1); - Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, - X86II::MO_TLSLDM, /*LocalDynamic=*/true); + Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM, + /*LoadGlobalBaseReg=*/true, + /*LocalDynamic=*/true); } // Note: the CleanupLocalDynamicTLSPass will remove redundant computations @@ -36060,36 +36072,6 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, } MachineBasicBlock * -X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, - MachineBasicBlock *BB) const { - // So, here we replace TLSADDR with the sequence: - // adjust_stackdown -> TLSADDR -> adjust_stackup. - // We need this because TLSADDR is lowered into calls - // inside MC, therefore without the two markers shrink-wrapping - // may push the prologue/epilogue pass them. - const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - const MIMetadata MIMD(MI); - MachineFunction &MF = *BB->getParent(); - - // Emit CALLSEQ_START right before the instruction. - MF.getFrameInfo().setAdjustsStack(true); - unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); - MachineInstrBuilder CallseqStart = - BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); - BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); - - // Emit CALLSEQ_END right after the instruction. - // We don't call erase from parent because we want to keep the - // original instruction around. - unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); - MachineInstrBuilder CallseqEnd = - BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0); - BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); - - return BB; -} - -MachineBasicBlock * X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const { // This is pretty easy. 
We're taking the value that we received from @@ -37091,16 +37073,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return X86::TMM0_TMM1 + Imm / 2; }; switch (MI.getOpcode()) { - default: llvm_unreachable("Unexpected instr type to insert"); - case X86::TLS_addr32: - case X86::TLS_addr64: - case X86::TLS_addrX32: - case X86::TLS_base_addr32: - case X86::TLS_base_addr64: - case X86::TLS_base_addrX32: - case X86::TLS_desc32: - case X86::TLS_desc64: - return EmitLoweredTLSAddr(MI, BB); + default: + llvm_unreachable("Unexpected instr type to insert"); case X86::INDIRECT_THUNK_CALL32: case X86::INDIRECT_THUNK_CALL64: case X86::INDIRECT_THUNK_TCRETURN32: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 14ada17..2db25d6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1848,9 +1848,6 @@ namespace llvm { MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI, - MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index a05c3f0..51cee2e 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -478,7 +478,7 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF], - usesCustomInserter = 1, Uses = [ESP, SSP] in { + Uses = [ESP, SSP] in { def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_addr32", [(X86tlsaddr tls32addr:$sym)]>, @@ -498,7 +498,7 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF], - usesCustomInserter = 1, Uses = [RSP, SSP] in { + Uses = [RSP, SSP] in { def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLS_addr64", [(X86tlsaddr tls64addr:$sym)]>, @@ -520,7 +520,7 @@ def TLS_base_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym), // TLSDESC only clobbers EAX and EFLAGS. ESP is marked as a use to prevent // stack-pointer assignments that appear immediately before calls from // potentially appearing dead. -let Defs = [EAX, EFLAGS], usesCustomInserter = 1, Uses = [RSP, SSP] in { +let Defs = [EAX, EFLAGS], Uses = [RSP, SSP] in { def TLS_desc32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_desc32", [(X86tlsdesc tls32addr:$sym)]>; def TLS_desc64 : I<0, Pseudo, (outs), (ins i64mem:$sym), diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 1efec22..17d8283 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -472,16 +472,24 @@ Constant *InstCostVisitor::visitCastInst(CastInst &I) { Constant *InstCostVisitor::visitCmpInst(CmpInst &I) { assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - bool Swap = I.getOperand(1) == LastVisited->first; - Value *V = Swap ? I.getOperand(0) : I.getOperand(1); + Constant *Const = LastVisited->second; + bool ConstOnRHS = I.getOperand(1) == LastVisited->first; + Value *V = ConstOnRHS ? 
I.getOperand(0) : I.getOperand(1); Constant *Other = findConstantFor(V, KnownConstants); - if (!Other) - return nullptr; - Constant *Const = LastVisited->second; - return Swap ? - ConstantFoldCompareInstOperands(I.getPredicate(), Other, Const, DL) - : ConstantFoldCompareInstOperands(I.getPredicate(), Const, Other, DL); + if (Other) { + if (ConstOnRHS) + std::swap(Const, Other); + return ConstantFoldCompareInstOperands(I.getPredicate(), Const, Other, DL); + } + + // If we haven't found Other to be a specific constant value, we may still be + // able to constant fold using information from the lattice value. + const ValueLatticeElement &ConstLV = ValueLatticeElement::get(Const); + const ValueLatticeElement &OtherLV = Solver.getLatticeValueFor(V); + auto &V1State = ConstOnRHS ? OtherLV : ConstLV; + auto &V2State = ConstOnRHS ? ConstLV : OtherLV; + return V1State.getCompare(I.getPredicate(), I.getType(), V2State, DL); } Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) { @@ -493,16 +501,17 @@ Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) { Constant *InstCostVisitor::visitBinaryOperator(BinaryOperator &I) { assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - bool Swap = I.getOperand(1) == LastVisited->first; - Value *V = Swap ? I.getOperand(0) : I.getOperand(1); + bool ConstOnRHS = I.getOperand(1) == LastVisited->first; + Value *V = ConstOnRHS ? I.getOperand(0) : I.getOperand(1); Constant *Other = findConstantFor(V, KnownConstants); - if (!Other) - return nullptr; + Value *OtherVal = Other ? Other : V; + Value *ConstVal = LastVisited->second; - Constant *Const = LastVisited->second; - return dyn_cast_or_null<Constant>(Swap ? - simplifyBinOp(I.getOpcode(), Other, Const, SimplifyQuery(DL)) - : simplifyBinOp(I.getOpcode(), Const, Other, SimplifyQuery(DL))); + if (ConstOnRHS) + std::swap(ConstVal, OtherVal); + + return dyn_cast_or_null<Constant>( + simplifyBinOp(I.getOpcode(), ConstVal, OtherVal, SimplifyQuery(DL))); } Constant *FunctionSpecializer::getPromotableAlloca(AllocaInst *Alloca, diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 440912e..b6906ab 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -823,21 +823,16 @@ Function *CodeExtractor::constructFunctionDeclaration( // Assemble the function's parameter lists. std::vector<Type *> ParamTy; std::vector<Type *> AggParamTy; - std::vector<std::tuple<unsigned, Value *>> NumberedInputs; - std::vector<std::tuple<unsigned, Value *>> NumberedOutputs; const DataLayout &DL = M->getDataLayout(); // Add the types of the input values to the function's argument list - unsigned ArgNum = 0; for (Value *value : inputs) { LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n"); if (AggregateArgs && !ExcludeArgsFromAggregate.contains(value)) { AggParamTy.push_back(value->getType()); StructValues.insert(value); - } else { + } else ParamTy.push_back(value->getType()); - NumberedInputs.emplace_back(ArgNum++, value); - } } // Add the types of the output values to the function's argument list. 
@@ -846,11 +841,9 @@ Function *CodeExtractor::constructFunctionDeclaration( if (AggregateArgs && !ExcludeArgsFromAggregate.contains(output)) { AggParamTy.push_back(output->getType()); StructValues.insert(output); - } else { + } else ParamTy.push_back( PointerType::get(output->getType(), DL.getAllocaAddrSpace())); - NumberedOutputs.emplace_back(ArgNum++, output); - } } assert( diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index f922fd9..a4be81a 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70 @@ -20,1595 +21,4947 @@ ; TODO: optimize .sys.shared into .cta.shared or .cluster.shared . -; generic statespace - -; CHECK-LABEL: generic_weak -define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { - ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +;; generic statespace + +; generic + +define void @generic_i8(ptr %a) { +; CHECK-LABEL: generic_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_i8_param_0]; +; CHECK-NEXT: ld.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load i8, ptr %a %a.add = add i8 %a.load, 1 - ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store i8 %a.add, ptr %a + ret void +} + +define void @generic_i16(ptr %a) { +; CHECK-LABEL: generic_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_i16_param_0]; +; CHECK-NEXT: ld.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load i16, ptr %a + %a.add = add i16 %a.load, 1 + store i16 %a.add, ptr %a + ret void +} + +define void @generic_i32(ptr %a) { +; CHECK-LABEL: generic_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_i32_param_0]; +; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load i32, ptr %a + %a.add = add i32 %a.load, 1 + store i32 %a.add, ptr %a + ret void +} + +define void @generic_i64(ptr %a) { +; CHECK-LABEL: generic_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_i64_param_0]; +; CHECK-NEXT: ld.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load i64, ptr %a + %a.add = add i64 %a.load, 1 + store i64 %a.add, ptr %a + ret void +} + +define void @generic_float(ptr %a) { +; CHECK-LABEL: generic_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_float_param_0]; +; CHECK-NEXT: ld.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + 
%a.load = load float, ptr %a + %a.add = fadd float %a.load, 1. + store float %a.add, ptr %a + ret void +} + +define void @generic_double(ptr %a) { +; CHECK-LABEL: generic_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_double_param_0]; +; CHECK-NEXT: ld.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load double, ptr %a + %a.add = fadd double %a.load, 1. + store double %a.add, ptr %a + ret void +} + +; TODO: make the lowering of this weak vector ops consistent with +; the ones of the next tests. This test lowers to a weak PTX +; vector op, but next test lowers to a vector PTX op. +define void @generic_2xi8(ptr %a) { +; CHECK-LABEL: generic_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi8_param_0]; +; CHECK-NEXT: ld.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i8>, ptr %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store <2 x i8> %a.add, ptr %a + ret void +} + +; TODO: make the lowering of this weak vector ops consistent with +; the ones of the previous test. This test lowers to a weak +; PTX scalar op, but prior test lowers to a vector PTX op. +define void @generic_4xi8(ptr %a) { +; CHECK-LABEL: generic_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi8_param_0]; +; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load <4 x i8>, ptr %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store <4 x i8> %a.add, ptr %a + ret void +} - ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr %b - - ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr %c - - ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr %d - - ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr %c - %e.add = fadd float %e.load, 1. 
- ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr %c - - ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr %d - %f.add = fadd double %f.load, 1. - ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr %d - - ; TODO: make the lowering of this weak vector ops consistent with - ; the ones of the next tests. This test lowers to a weak PTX - ; vector op, but next test lowers to a vector PTX op. - ; CHECK: ld.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load <2 x i8>, ptr %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <2 x i8> %h.add, ptr %b - - ; TODO: make the lowering of this weak vector ops consistent with - ; the ones of the previous test. This test lowers to a weak - ; PTX scalar op, but prior test lowers to a vector PTX op. - ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load <4 x i8>, ptr %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <4 x i8> %i.add, ptr %c - - ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load <2 x i16>, ptr %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <2 x i16> %j.add, ptr %c - - ; CHECK: ld.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load <4 x i16>, ptr %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <4 x i16> %k.add, ptr %d - - ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load <2 x i32>, ptr %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store <2 x i32> %l.add, ptr %d - - ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load <4 x i32>, ptr %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store <4 x i32> %m.add, ptr %d - - ; CHECK: ld.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load <2 x i64>, ptr %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store <2 x i64> %n.add, ptr %d - - ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load <2 x float>, ptr %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store <2 x float> %o.add, ptr %d - - ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load <4 x float>, ptr %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store <4 x float> %p.add, ptr %d - - ; CHECK: ld.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load <2 x double>, ptr %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store <2 x double> %q.add, ptr %d - - ret void -} - -; CHECK-LABEL: generic_volatile -define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { - ; CHECK: ld.volatile.u8 
%rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @generic_2xi16(ptr %a) { +; CHECK-LABEL: generic_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi16_param_0]; +; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load <2 x i16>, ptr %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store <2 x i16> %a.add, ptr %a + ret void +} + +define void @generic_4xi16(ptr %a) { +; CHECK-LABEL: generic_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi16_param_0]; +; CHECK-NEXT: ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i16>, ptr %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store <4 x i16> %a.add, ptr %a + ret void +} + +define void @generic_2xi32(ptr %a) { +; CHECK-LABEL: generic_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi32_param_0]; +; CHECK-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i32>, ptr %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store <2 x i32> %a.add, ptr %a + ret void +} + +define void @generic_4xi32(ptr %a) { +; CHECK-LABEL: generic_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi32_param_0]; +; CHECK-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i32>, ptr %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store <4 x i32> %a.add, ptr %a + ret void +} + +define void @generic_2xi64(ptr %a) { +; CHECK-LABEL: generic_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi64_param_0]; +; CHECK-NEXT: ld.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load <2 x i64>, ptr %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store <2 x i64> %a.add, ptr %a + ret void +} + +define void @generic_2xfloat(ptr %a) { +; CHECK-LABEL: generic_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xfloat_param_0]; +; CHECK-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: 
st.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load <2 x float>, ptr %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store <2 x float> %a.add, ptr %a + ret void +} + +define void @generic_4xfloat(ptr %a) { +; CHECK-LABEL: generic_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xfloat_param_0]; +; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load <4 x float>, ptr %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store <4 x float> %a.add, ptr %a + ret void +} + +define void @generic_2xdouble(ptr %a) { +; CHECK-LABEL: generic_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xdouble_param_0]; +; CHECK-NEXT: ld.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load <2 x double>, ptr %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store <2 x double> %a.add, ptr %a + ret void +} + +; generic_volatile + +define void @generic_volatile_i8(ptr %a) { +; CHECK-LABEL: generic_volatile_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i8_param_0]; +; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load volatile i8, ptr %a %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store volatile i8 %a.add, ptr %a + ret void +} + +define void @generic_volatile_i16(ptr %a) { +; CHECK-LABEL: generic_volatile_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i16_param_0]; +; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load volatile i16, ptr %a + %a.add = add i16 %a.load, 1 + store volatile i16 %a.add, ptr %a + ret void +} + +define void @generic_volatile_i32(ptr %a) { +; CHECK-LABEL: generic_volatile_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i32_param_0]; +; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile i32, ptr %a + %a.add = add i32 %a.load, 1 + store volatile i32 %a.add, ptr %a + ret void +} + +define void @generic_volatile_i64(ptr %a) { +; CHECK-LABEL: generic_volatile_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i64_param_0]; +; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, 
%rd2, 1; +; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load volatile i64, ptr %a + %a.add = add i64 %a.load, 1 + store volatile i64 %a.add, ptr %a + ret void +} + +define void @generic_volatile_float(ptr %a) { +; CHECK-LABEL: generic_volatile_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_float_param_0]; +; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load volatile float, ptr %a + %a.add = fadd float %a.load, 1. + store volatile float %a.add, ptr %a + ret void +} + +define void @generic_volatile_double(ptr %a) { +; CHECK-LABEL: generic_volatile_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_double_param_0]; +; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load volatile double, ptr %a + %a.add = fadd double %a.load, 1. + store volatile double %a.add, ptr %a + ret void +} + +; TODO: volatile, atomic, and volatile atomic memory operations on vector types. +; Currently, LLVM: +; - does not allow atomic operations on vectors. +; - it allows volatile operations but not clear what that means. +; Following both semantics make sense in general and PTX supports both: +; - volatile/atomic/volatile atomic applies to the whole vector +; - volatile/atomic/volatile atomic applies elementwise +; Actions required: +; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those +; Below tests show that the current implementation picks the semantics in an inconsistent way +; * volatile <2 x i8> lowers to "elementwise volatile" +; * <4 x i8> lowers to "full vector volatile" +; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics +; - update tests in load-store-sm70.ll as well. + +; TODO: make this operation consistent with the one for <4 x i8> +; This operation lowers to a "element wise volatile PTX operation". +define void @generic_volatile_2xi8(ptr %a) { +; CHECK-LABEL: generic_volatile_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.volatile.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.volatile.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i8>, ptr %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store volatile <2 x i8> %a.add, ptr %a + ret void +} + +; TODO: make this operation consistent with the one for <2 x i8> +; This operation lowers to a "full vector volatile PTX operation". 
+define void @generic_volatile_4xi8(ptr %a) { +; CHECK-LABEL: generic_volatile_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.volatile.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i8>, ptr %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store volatile <4 x i8> %a.add, ptr %a + ret void +} + +define void @generic_volatile_2xi16(ptr %a) { +; CHECK-LABEL: generic_volatile_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i16>, ptr %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store volatile <2 x i16> %a.add, ptr %a + ret void +} + +define void @generic_volatile_4xi16(ptr %a) { +; CHECK-LABEL: generic_volatile_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.volatile.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.volatile.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i16>, ptr %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store volatile <4 x i16> %a.add, ptr %a + ret void +} + +define void @generic_volatile_2xi32(ptr %a) { +; CHECK-LABEL: generic_volatile_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi32_param_0]; +; CHECK-NEXT: ld.volatile.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.volatile.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i32>, ptr %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store volatile <2 x i32> %a.add, ptr %a + ret void +} + +define void @generic_volatile_4xi32(ptr %a) { +; CHECK-LABEL: generic_volatile_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; 
CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.volatile.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i32>, ptr %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store volatile <4 x i32> %a.add, ptr %a + ret void +} + +define void @generic_volatile_2xi64(ptr %a) { +; CHECK-LABEL: generic_volatile_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.volatile.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i64>, ptr %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store volatile <2 x i64> %a.add, ptr %a + ret void +} + +define void @generic_volatile_2xfloat(ptr %a) { +; CHECK-LABEL: generic_volatile_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.volatile.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x float>, ptr %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store volatile <2 x float> %a.add, ptr %a + ret void +} + +define void @generic_volatile_4xfloat(ptr %a) { +; CHECK-LABEL: generic_volatile_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x float>, ptr %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store volatile <4 x float> %a.add, ptr %a + ret void +} + +define void @generic_volatile_2xdouble(ptr %a) { +; CHECK-LABEL: generic_volatile_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x double>, ptr %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store volatile <2 x double> %a.add, ptr %a + ret void +} - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - 
store volatile i16 %b.add, ptr %b - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr %c - - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr %d - - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr %c - - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr %c - - ; TODO: volatile, atomic, and volatile atomic memory operations on vector types. - ; Currently, LLVM: - ; - does not allow atomic operations on vectors. - ; - it allows volatile operations but not clear what that means. - ; Following both semantics make sense in general and PTX supports both: - ; - volatile/atomic/volatile atomic applies to the whole vector - ; - volatile/atomic/volatile atomic applies elementwise - ; Actions required: - ; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those - ; Below tests show that the current implementation picks the semantics in an inconsistent way - ; * volatile <2 x i8> lowers to "elementwise volatile" - ; * <4 x i8> lowers to "full vector volatile" - ; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics - ; - update tests in load-store-sm70.ll as well. - - ; TODO: make this operation consistent with the one for <4 x i8> - ; This operation lowers to a "element wise volatile PTX operation". - ; CHECK: ld.volatile.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load volatile <2 x i8>, ptr %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.volatile.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <2 x i8> %h.add, ptr %b - - ; TODO: make this operation consistent with the one for <2 x i8> - ; This operation lowers to a "full vector volatile PTX operation". 
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load volatile <4 x i8>, ptr %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <4 x i8> %i.add, ptr %c - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load volatile <2 x i16>, ptr %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <2 x i16> %j.add, ptr %c - - ; CHECK: ld.volatile.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load volatile <4 x i16>, ptr %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.volatile.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <4 x i16> %k.add, ptr %d - - ; CHECK: ld.volatile.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load volatile <2 x i32>, ptr %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.volatile.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <2 x i32> %l.add, ptr %d - - ; CHECK: ld.volatile.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load volatile <4 x i32>, ptr %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.volatile.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <4 x i32> %m.add, ptr %d - - ; CHECK: ld.volatile.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load volatile <2 x i64>, ptr %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.volatile.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store volatile <2 x i64> %n.add, ptr %d - - ; CHECK: ld.volatile.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load volatile <2 x float>, ptr %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.volatile.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <2 x float> %o.add, ptr %d - - ; CHECK: ld.volatile.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load volatile <4 x float>, ptr %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.volatile.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <4 x float> %p.add, ptr %d - - ; CHECK: ld.volatile.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load volatile <2 x double>, ptr %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.volatile.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store volatile <2 x double> %q.add, ptr %d - - ret void -} - -; CHECK-LABEL: generic_unordered_sys -define void @generic_unordered_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; generic_unordered_sys + +define void @generic_unordered_sys_i8(ptr %a) { +; SM60-LABEL: generic_unordered_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: 
generic_unordered_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a unordered, align 1 + ret void +} + +define void @generic_unordered_sys_i16(ptr %a) { +; SM60-LABEL: generic_unordered_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_unordered_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr %a unordered, align 2 + ret void +} - ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b unordered, align 2 - - ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c unordered, align 4 - - ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d unordered, align 8 - - ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e unordered, align 4 - - ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
- ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: generic_unordered_volatile_sys -define void @generic_unordered_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @generic_unordered_sys_i32(ptr %a) { +; SM60-LABEL: generic_unordered_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_unordered_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr %a unordered, align 4 + ret void +} + +define void @generic_unordered_sys_i64(ptr %a) { +; SM60-LABEL: generic_unordered_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_unordered_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr %a unordered, align 8 + ret void +} + +define void @generic_unordered_sys_float(ptr %a) { +; SM60-LABEL: generic_unordered_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0]; +; SM60-NEXT: ld.volatile.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_unordered_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr %a unordered, align 4 + %a.add = fadd float %a.load, 1. 
+ store atomic float %a.add, ptr %a unordered, align 4 + ret void +} + +define void @generic_unordered_sys_double(ptr %a) { +; SM60-LABEL: generic_unordered_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0]; +; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_unordered_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr %a unordered, align 8 + %a.add = fadd double %a.load, 1. + store atomic double %a.add, ptr %a unordered, align 8 + ret void +} + +; generic_unordered_volatile_sys + +define void @generic_unordered_volatile_sys_i8(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a unordered, align 1 + ret void +} - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b unordered, align 2 - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c unordered, align 4 +define void @generic_unordered_volatile_sys_i16(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr %a unordered, align 2 + ret void +} - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d unordered, align 8 +define void @generic_unordered_volatile_sys_i32(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i32_param_0]; +; 
CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr %a unordered, align 4 + ret void +} - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e unordered, align 4 +define void @generic_unordered_volatile_sys_i64(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i64, ptr %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr %a unordered, align 8 + ret void +} - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e unordered, align 8 +define void @generic_unordered_volatile_sys_float(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr %a unordered, align 4 + ret void +} +define void @generic_unordered_volatile_sys_double(ptr %a) { +; CHECK-LABEL: generic_unordered_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr %a unordered, align 8 ret void } -; CHECK-LABEL: generic_monotonic_sys -define void @generic_monotonic_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; generic_monotonic_sys + +define void @generic_monotonic_sys_i8(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a monotonic, align 1 + ret void +} + +define void @generic_monotonic_sys_i16(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr %a monotonic, align 2 + ret void +} - ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b monotonic, align 2 - - ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c monotonic, align 4 - - ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d monotonic, align 8 - - ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: 
ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e monotonic, align 4 - %e.add = fadd float %e.load, 1. - ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e monotonic, align 4 - - ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e monotonic, align 8 - - ret void -} - -; CHECK-LABEL: generic_monotonic_volatile_sys -define void @generic_monotonic_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @generic_monotonic_sys_i32(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr %a monotonic, align 4 + ret void +} + +define void @generic_monotonic_sys_i64(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr %a monotonic, align 8 + ret void +} + +define void @generic_monotonic_sys_float(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0]; +; SM60-NEXT: ld.volatile.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.f32 [%rd1], 
%f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr %a monotonic, align 4 + %a.add = fadd float %a.load, 1. + store atomic float %a.add, ptr %a monotonic, align 4 + ret void +} + +define void @generic_monotonic_sys_double(ptr %a) { +; SM60-LABEL: generic_monotonic_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0]; +; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: generic_monotonic_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr %a monotonic, align 8 + %a.add = fadd double %a.load, 1. + store atomic double %a.add, ptr %a monotonic, align 8 + ret void +} + +; generic_monotonic_volatile_sys + +define void @generic_monotonic_volatile_sys_i8(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a monotonic, align 1 + ret void +} - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b monotonic, align 2 - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c monotonic, align 4 +define void @generic_monotonic_volatile_sys_i16(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr %a monotonic, align 2 + ret void +} - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d monotonic, align 8 +define void @generic_monotonic_volatile_sys_i32(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; 
CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr %a monotonic, align 4 + ret void +} - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e monotonic, align 4 - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e monotonic, align 4 +define void @generic_monotonic_volatile_sys_i64(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i64, ptr %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr %a monotonic, align 8 + ret void +} - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e monotonic, align 8 +define void @generic_monotonic_volatile_sys_float(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr %a monotonic, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr %a monotonic, align 4 + ret void +} +define void @generic_monotonic_volatile_sys_double(ptr %a) { +; CHECK-LABEL: generic_monotonic_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr %a monotonic, align 8 ret void } ;; global statespace -; CHECK-LABEL: global_weak -define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { - ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; global + +define void @global_i8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_i8_param_0]; +; CHECK-NEXT: ld.global.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.global.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load i8, ptr addrspace(1) %a %a.add = add i8 %a.load, 1 - ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store i8 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_i16(ptr addrspace(1) %a) { +; CHECK-LABEL: global_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_i16_param_0]; +; CHECK-NEXT: ld.global.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.global.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load i16, ptr addrspace(1) %a + %a.add = add i16 %a.load, 1 + store i16 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_i32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_i32_param_0]; +; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.global.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load i32, ptr addrspace(1) %a + %a.add = add i32 %a.load, 1 + store i32 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_i64(ptr addrspace(1) %a) { +; CHECK-LABEL: global_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_i64_param_0]; +; CHECK-NEXT: ld.global.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.global.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load i64, ptr addrspace(1) %a + %a.add = add i64 %a.load, 1 + store i64 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_float(ptr addrspace(1) %a) { +; CHECK-LABEL: global_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_float_param_0]; +; CHECK-NEXT: ld.global.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.global.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load float, ptr addrspace(1) %a + %a.add = fadd float %a.load, 1. + store float %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_double(ptr addrspace(1) %a) { +; CHECK-LABEL: global_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_double_param_0]; +; CHECK-NEXT: ld.global.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.global.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load double, ptr addrspace(1) %a + %a.add = fadd double %a.load, 1. 
+ store double %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_2xi8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi8_param_0]; +; CHECK-NEXT: ld.global.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.global.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i8>, ptr addrspace(1) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store <2 x i8> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_4xi8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi8_param_0]; +; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.global.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load <4 x i8>, ptr addrspace(1) %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store <4 x i8> %a.add, ptr addrspace(1) %a + ret void +} - ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(1) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(1) %b - - ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(1) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(1) %c - - ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(1) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(1) %d - - ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(1) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(1) %c - - ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(1) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(1) %c - - ; CHECK: ld.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load <2 x i8>, ptr addrspace(1) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <2 x i8> %h.add, ptr addrspace(1) %b - - ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load <4 x i8>, ptr addrspace(1) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <4 x i8> %i.add, ptr addrspace(1) %c - - ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load <2 x i16>, ptr addrspace(1) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <2 x i16> %j.add, ptr addrspace(1) %c - - ; CHECK: ld.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load <4 x i16>, ptr addrspace(1) %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <4 x i16> %k.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load <2 x i32>, ptr addrspace(1) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store <2 x i32> %l.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load <4 x i32>, ptr addrspace(1) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store <4 x i32> %m.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load <2 x i64>, ptr addrspace(1) %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store <2 x i64> %n.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load <2 x float>, ptr addrspace(1) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store <2 x float> %o.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load <4 x float>, ptr addrspace(1) %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store <4 x float> %p.add, ptr addrspace(1) %d - - ; CHECK: ld.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load <2 x double>, ptr addrspace(1) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store <2 x double> %q.add, ptr addrspace(1) %d - - ret void -} - -; CHECK-LABEL: global_volatile -define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @global_2xi16(ptr addrspace(1) %a) { +; CHECK-LABEL: 
global_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi16_param_0]; +; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.global.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load <2 x i16>, ptr addrspace(1) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store <2 x i16> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_4xi16(ptr addrspace(1) %a) { +; CHECK-LABEL: global_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi16_param_0]; +; CHECK-NEXT: ld.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i16>, ptr addrspace(1) %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store <4 x i16> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_2xi32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi32_param_0]; +; CHECK-NEXT: ld.global.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.global.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i32>, ptr addrspace(1) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store <2 x i32> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_4xi32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi32_param_0]; +; CHECK-NEXT: ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i32>, ptr addrspace(1) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store <4 x i32> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_2xi64(ptr addrspace(1) %a) { +; CHECK-LABEL: global_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi64_param_0]; +; CHECK-NEXT: ld.global.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.global.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load <2 x i64>, ptr addrspace(1) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store <2 x i64> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_2xfloat(ptr addrspace(1) %a) { +; CHECK-LABEL: global_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, 
[global_2xfloat_param_0]; +; CHECK-NEXT: ld.global.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.global.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load <2 x float>, ptr addrspace(1) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store <2 x float> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_4xfloat(ptr addrspace(1) %a) { +; CHECK-LABEL: global_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_4xfloat_param_0]; +; CHECK-NEXT: ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load <4 x float>, ptr addrspace(1) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store <4 x float> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_2xdouble(ptr addrspace(1) %a) { +; CHECK-LABEL: global_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_2xdouble_param_0]; +; CHECK-NEXT: ld.global.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.global.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load <2 x double>, ptr addrspace(1) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store <2 x double> %a.add, ptr addrspace(1) %a + ret void +} + +; global_volatile + +define void @global_volatile_i8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i8_param_0]; +; CHECK-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load volatile i8, ptr addrspace(1) %a %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store volatile i8 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_i16(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i16_param_0]; +; CHECK-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load volatile i16, ptr addrspace(1) %a + %a.add = add i16 %a.load, 1 + store volatile i16 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_i32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i32_param_0]; +; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2; 
+; CHECK-NEXT: ret; + %a.load = load volatile i32, ptr addrspace(1) %a + %a.add = add i32 %a.load, 1 + store volatile i32 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_i64(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i64_param_0]; +; CHECK-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load volatile i64, ptr addrspace(1) %a + %a.add = add i64 %a.load, 1 + store volatile i64 %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_float(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_float_param_0]; +; CHECK-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.global.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load volatile float, ptr addrspace(1) %a + %a.add = fadd float %a.load, 1. + store volatile float %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_double(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_double_param_0]; +; CHECK-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load volatile double, ptr addrspace(1) %a + %a.add = fadd double %a.load, 1. 
+ store volatile double %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_2xi8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i8>, ptr addrspace(1) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store volatile <2 x i8> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_4xi8(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i8>, ptr addrspace(1) %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store volatile <4 x i8> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_2xi16(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i16>, ptr addrspace(1) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store volatile <2 x i16> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_4xi16(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.volatile.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.volatile.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i16>, ptr addrspace(1) %a + 
%a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store volatile <4 x i16> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_2xi32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi32_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.volatile.global.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i32>, ptr addrspace(1) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store volatile <2 x i32> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_4xi32(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.volatile.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i32>, ptr addrspace(1) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store volatile <4 x i32> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_2xi64(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.volatile.global.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i64>, ptr addrspace(1) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store volatile <2 x i64> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_2xfloat(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.global.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x float>, ptr addrspace(1) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store volatile <2 x float> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_4xfloat(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.global.v4.f32 [%rd1], 
{%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x float>, ptr addrspace(1) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store volatile <4 x float> %a.add, ptr addrspace(1) %a + ret void +} + +define void @global_volatile_2xdouble(ptr addrspace(1) %a) { +; CHECK-LABEL: global_volatile_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.global.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x double>, ptr addrspace(1) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store volatile <2 x double> %a.add, ptr addrspace(1) %a + ret void +} - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(1) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(1) %b - - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(1) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(1) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(1) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(1) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load volatile <2 x i8>, ptr addrspace(1) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.volatile.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile<2 x i8> %h.add, ptr addrspace(1) %b - - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load volatile <4 x i8>, ptr addrspace(1) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile<4 x i8> %i.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load volatile <2 x i16>, ptr addrspace(1) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile<2 x i16> %j.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load volatile <4 x i16>, ptr addrspace(1) %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.volatile.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile<4 x i16> %k.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load volatile <2 x i32>, ptr addrspace(1) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.volatile.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile<2 x i32> %l.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load volatile <4 x i32>, ptr addrspace(1) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.volatile.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile<4 x i32> %m.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load volatile <2 x i64>, ptr addrspace(1) %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.volatile.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store volatile<2 x i64> %n.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load volatile <2 x float>, ptr addrspace(1) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.volatile.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile<2 x float> %o.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load volatile <4 x float>, ptr addrspace(1) %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.volatile.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile<4 x float> %p.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load volatile <2 x double>, ptr addrspace(1) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.volatile.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store 
volatile<2 x double> %q.add, ptr addrspace(1) %d - - ret void -} - -; CHECK-LABEL: global_unordered_sys -define void @global_unordered_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; global_unordered_sys + +define void @global_unordered_sys_i8(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 + ret void +} + +define void @global_unordered_sys_i16(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr addrspace(1) %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(1) %a unordered, align 2 + ret void +} - ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 - - ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 - - ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = 
load atomic i64, ptr addrspace(1) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 - - ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e unordered, align 4 - - ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: global_unordered_volatile_sys -define void @global_unordered_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @global_unordered_sys_i32(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(1) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(1) %a unordered, align 4 + ret void +} + +define void @global_unordered_sys_i64(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(1) %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(1) %a unordered, align 8 + ret void +} + +define void @global_unordered_sys_float(ptr addrspace(1) %a) { +; 
SM60-LABEL: global_unordered_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr addrspace(1) %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic float %a.add, ptr addrspace(1) %a unordered, align 4 + ret void +} + +define void @global_unordered_sys_double(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr addrspace(1) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr addrspace(1) %a unordered, align 8 + ret void +} + +; global_unordered_volatile_sys + +define void @global_unordered_volatile_sys_i8(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_volatile_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 + ret void +} + +define void @global_unordered_volatile_sys_i16(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_volatile_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic volatile i16, ptr addrspace(1) %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(1) %a unordered, align 2 + ret void +} + +define void @global_unordered_volatile_sys_i32(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_volatile_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(1) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(1) %a unordered, align 4 + ret void +} + +define void @global_unordered_volatile_sys_i64(ptr addrspace(1) 
%a) { +; SM60-LABEL: global_unordered_volatile_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic volatile i64, ptr addrspace(1) %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(1) %a unordered, align 8 + ret void +} - ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 - - ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 - - ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 - - ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4 - - ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
- ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: global_monotonic_sys -define void @global_monotonic_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @global_unordered_volatile_sys_float(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_volatile_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(1) %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr addrspace(1) %a unordered, align 4 + ret void +} + +define void @global_unordered_volatile_sys_double(ptr addrspace(1) %a) { +; SM60-LABEL: global_unordered_volatile_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_unordered_volatile_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(1) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr addrspace(1) %a unordered, align 8 + ret void +} + +; global_monotonic_sys + +define void @global_monotonic_sys_i8(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + ret void +} - ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - - ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - - ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - - ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 - %e.add = fadd float %e.load, 1. - ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 - - ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 - - ret void -} - -; CHECK-LABEL: global_monotonic_volatile_sys -define void @global_monotonic_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @global_monotonic_sys_i16(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr addrspace(1) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(1) %a monotonic, align 2 + ret void +} + +define void @global_monotonic_sys_i32(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(1) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(1) %a monotonic, align 4 + ret void +} + +define void @global_monotonic_sys_i64(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(1) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(1) %a monotonic, align 8 + ret void +} + +define void 
@global_monotonic_sys_float(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr addrspace(1) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. + store atomic float %a.add, ptr addrspace(1) %a monotonic, align 4 + ret void +} + +define void @global_monotonic_sys_double(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr addrspace(1) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr addrspace(1) %a monotonic, align 8 + ret void +} + +; global_monotonic_volatile_sys + +define void @global_monotonic_volatile_sys_i8(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + ret void +} - ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - - ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - - ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - - ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 - %e.add = fadd float %e.load, 1. - ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 - - ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 +define void @global_monotonic_volatile_sys_i16(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic volatile i16, ptr addrspace(1) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(1) %a monotonic, align 2 + ret void +} +define void @global_monotonic_volatile_sys_i32(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(1) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(1) %a monotonic, align 4 + ret void +} + +define void @global_monotonic_volatile_sys_i64(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic volatile i64, ptr addrspace(1) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(1) %a monotonic, align 8 + ret void +} + +define void @global_monotonic_volatile_sys_float(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_float( +; SM60: { +; SM60-NEXT: 
.reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(1) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr addrspace(1) %a monotonic, align 4 + ret void +} + +define void @global_monotonic_volatile_sys_double(ptr addrspace(1) %a) { +; SM60-LABEL: global_monotonic_volatile_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: global_monotonic_volatile_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(1) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr addrspace(1) %a monotonic, align 8 ret void } ;; shared statespace -; CHECK-LABEL: shared_weak -define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { - ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; shared + +define void @shared_i8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_i8_param_0]; +; CHECK-NEXT: ld.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load i8, ptr addrspace(3) %a %a.add = add i8 %a.load, 1 - ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store i8 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_i16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_i16_param_0]; +; CHECK-NEXT: ld.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load i16, ptr addrspace(3) %a + %a.add = add i16 %a.load, 1 + store i16 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_i32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_i32_param_0]; +; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load i32, ptr addrspace(3) %a + %a.add = add i32 %a.load, 1 + store i32 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_i64(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_i64_param_0]; +; CHECK-NEXT: ld.shared.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.shared.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load i64, ptr addrspace(3) %a + %a.add = add i64 %a.load, 1 + store i64 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_float(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_float_param_0]; +; CHECK-NEXT: ld.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.shared.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load float, ptr addrspace(3) %a + %a.add = fadd float %a.load, 1. + store float %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_double(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_double_param_0]; +; CHECK-NEXT: ld.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load double, ptr addrspace(3) %a + %a.add = fadd double %a.load, 1. 
+ store double %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_2xi8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi8_param_0]; +; CHECK-NEXT: ld.shared.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.shared.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i8>, ptr addrspace(3) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store <2 x i8> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_4xi8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi8_param_0]; +; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.shared.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load <4 x i8>, ptr addrspace(3) %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store <4 x i8> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_2xi16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi16_param_0]; +; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load <2 x i16>, ptr addrspace(3) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store <2 x i16> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_4xi16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi16_param_0]; +; CHECK-NEXT: ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i16>, ptr addrspace(3) %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store <4 x i16> %a.add, ptr addrspace(3) %a + ret void +} - ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(3) %b - %b.add = add i16 %b.load, 1 - 
; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(3) %b - - ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(3) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(3) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(3) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(3) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load <2 x i8>, ptr addrspace(3) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <2 x i8> %h.add, ptr addrspace(3) %b - - ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load <4 x i8>, ptr addrspace(3) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <4 x i8> %i.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load <2 x i16>, ptr addrspace(3) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <2 x i16> %j.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load <4 x i16>, ptr addrspace(3) %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <4 x i16> %k.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load <2 x i32>, ptr addrspace(3) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store <2 x i32> %l.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load <4 x i32>, ptr addrspace(3) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store <4 x i32> %m.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load <2 x i64>, ptr addrspace(3) %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store <2 x i64> %n.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load <2 x float>, ptr addrspace(3) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store <2 x float> %o.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] 
- %p.load = load <4 x float>, ptr addrspace(3) %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store <4 x float> %p.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load <2 x double>, ptr addrspace(3) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store <2 x double> %q.add, ptr addrspace(3) %d - - ret void -} - -; CHECK-LABEL: shared_volatile -define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @shared_2xi32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi32_param_0]; +; CHECK-NEXT: ld.shared.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.shared.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i32>, ptr addrspace(3) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store <2 x i32> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_4xi32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi32_param_0]; +; CHECK-NEXT: ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i32>, ptr addrspace(3) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store <4 x i32> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_2xi64(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi64_param_0]; +; CHECK-NEXT: ld.shared.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.shared.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load <2 x i64>, ptr addrspace(3) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store <2 x i64> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_2xfloat(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xfloat_param_0]; +; CHECK-NEXT: ld.shared.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.shared.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load <2 x float>, ptr addrspace(3) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store <2 x float> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_4xfloat(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: 
.reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xfloat_param_0]; +; CHECK-NEXT: ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load <4 x float>, ptr addrspace(3) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store <4 x float> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_2xdouble(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xdouble_param_0]; +; CHECK-NEXT: ld.shared.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.shared.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load <2 x double>, ptr addrspace(3) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store <2 x double> %a.add, ptr addrspace(3) %a + ret void +} + +; shared_volatile + +define void @shared_volatile_i8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i8_param_0]; +; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load volatile i8, ptr addrspace(3) %a %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store volatile i8 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_i16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i16_param_0]; +; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load volatile i16, ptr addrspace(3) %a + %a.add = add i16 %a.load, 1 + store volatile i16 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_i32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i32_param_0]; +; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile i32, ptr addrspace(3) %a + %a.add = add i32 %a.load, 1 + store volatile i32 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_i64(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i64_param_0]; +; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; 
CHECK-NEXT: ret; + %a.load = load volatile i64, ptr addrspace(3) %a + %a.add = add i64 %a.load, 1 + store volatile i64 %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_float(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_float_param_0]; +; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load volatile float, ptr addrspace(3) %a + %a.add = fadd float %a.load, 1. + store volatile float %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_double(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_double_param_0]; +; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load volatile double, ptr addrspace(3) %a + %a.add = fadd double %a.load, 1. + store volatile double %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_2xi8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i8>, ptr addrspace(3) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store volatile <2 x i8> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_4xi8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i8>, ptr addrspace(3) %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store volatile <4 x i8> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_2xi16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xi16( +; CHECK: { 
+; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i16>, ptr addrspace(3) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store volatile <2 x i16> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_4xi16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i16>, ptr addrspace(3) %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store volatile <4 x i16> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_2xi32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi32_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.volatile.shared.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i32>, ptr addrspace(3) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store volatile <2 x i32> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_4xi32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i32>, ptr addrspace(3) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store volatile <4 x i32> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_2xi64(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i64>, ptr addrspace(3) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store volatile <2 x i64> %a.add, ptr addrspace(3) %a + 
ret void +} - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(3) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(3) %b - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(3) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(3) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(3) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(3) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load volatile <2 x i8>, ptr addrspace(3) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.volatile.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <2 x i8> %h.add, ptr addrspace(3) %b - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load volatile <4 x i8>, ptr addrspace(3) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <4 x i8> %i.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load volatile <2 x i16>, ptr addrspace(3) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <2 x i16> %j.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load volatile <4 x i16>, ptr addrspace(3) %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.volatile.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <4 x i16> %k.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load volatile <2 x i32>, ptr addrspace(3) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.volatile.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <2 x i32> %l.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load volatile <4 x i32>, ptr addrspace(3) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.volatile.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <4 x i32> %m.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load volatile <2 x i64>, ptr addrspace(3) %d - %n.add = add <2 x i64> 
%n.load, <i64 1, i64 1> - ; CHECK: st.volatile.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store volatile <2 x i64> %n.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load volatile <2 x float>, ptr addrspace(3) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.volatile.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <2 x float> %o.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load volatile <4 x float>, ptr addrspace(3) %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.volatile.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <4 x float> %p.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load volatile <2 x double>, ptr addrspace(3) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.volatile.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store volatile <2 x double> %q.add, ptr addrspace(3) %d - - ret void -} - -; CHECK-LABEL: shared_unordered_sys -define void @shared_unordered_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @shared_volatile_2xfloat(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.shared.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x float>, ptr addrspace(3) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store volatile <2 x float> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_4xfloat(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x float>, ptr addrspace(3) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store volatile <4 x float> %a.add, ptr addrspace(3) %a + ret void +} + +define void @shared_volatile_2xdouble(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_volatile_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1]; +; 
CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x double>, ptr addrspace(3) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store volatile <2 x double> %a.add, ptr addrspace(3) %a + ret void +} + +; shared_unordered_sys + +define void @shared_unordered_sys_i8(ptr addrspace(3) %a) { +; SM60-LABEL: shared_unordered_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 + ret void +} - ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 - - ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 - - ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 - - ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 - - ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
- ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: shared_unordered_volatile_sys -define void @shared_unordered_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @shared_unordered_sys_i16(ptr addrspace(3) %a) { +; SM60-LABEL: shared_unordered_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr addrspace(3) %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(3) %a unordered, align 2 + ret void +} + +define void @shared_unordered_sys_i32(ptr addrspace(3) %a) { +; SM60-LABEL: shared_unordered_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(3) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(3) %a unordered, align 4 + ret void +} + +define void @shared_unordered_sys_i64(ptr addrspace(3) %a) { +; SM60-LABEL: shared_unordered_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(3) %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(3) %a unordered, align 8 + ret void +} + +define void @shared_unordered_sys_float(ptr addrspace(3) %a) { +; SM60-LABEL: 
shared_unordered_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0]; +; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr addrspace(3) %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic float %a.add, ptr addrspace(3) %a unordered, align 4 + ret void +} + +define void @shared_unordered_sys_double(ptr addrspace(3) %a) { +; SM60-LABEL: shared_unordered_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0]; +; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_unordered_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr addrspace(3) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr addrspace(3) %a unordered, align 8 + ret void +} + +; shared_unordered_volatile_sys + +define void @shared_unordered_volatile_sys_i8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_unordered_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1 + ret void +} - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2 - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4 +define void @shared_unordered_volatile_sys_i16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_unordered_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr addrspace(3) %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(3) %a unordered, align 2 + ret void +} - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8 +define void @shared_unordered_volatile_sys_i32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_unordered_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(3) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(3) %a unordered, align 4 + ret void +} - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4 +define void @shared_unordered_volatile_sys_i64(ptr addrspace(3) %a) { +; CHECK-LABEL: 
shared_unordered_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i64, ptr addrspace(3) %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(3) %a unordered, align 8 + ret void +} - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8 +define void @shared_unordered_volatile_sys_float(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_unordered_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(3) %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr addrspace(3) %a unordered, align 4 + ret void +} +define void @shared_unordered_volatile_sys_double(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_unordered_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(3) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr addrspace(3) %a unordered, align 8 ret void } -; CHECK-LABEL: shared_monotonic_sys -define void @shared_monotonic_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; shared_monotonic_sys + +define void @shared_monotonic_sys_i8(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_i8( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_monotonic_sys_i8( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2; +; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + ret void +} + +define void @shared_monotonic_sys_i16(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_i16( +; SM60: { +; SM60-NEXT: .reg .b16 %rs<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; SM60-NEXT: add.s16 %rs2, %rs1, 1; +; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_monotonic_sys_i16( +; SM70: { +; SM70-NEXT: .reg .b16 %rs<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1]; +; SM70-NEXT: add.s16 %rs2, %rs1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2; +; SM70-NEXT: ret; + %a.load = load atomic i16, ptr addrspace(3) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(3) %a monotonic, align 2 + ret void +} + +define void @shared_monotonic_sys_i32(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_i32( +; SM60: { +; SM60-NEXT: .reg .b32 %r<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; SM60-NEXT: add.s32 %r2, %r1, 1; +; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_monotonic_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1]; +; SM70-NEXT: add.s32 %r2, %r1, 1; +; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2; +; SM70-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(3) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(3) %a 
monotonic, align 4 + ret void +} + +define void @shared_monotonic_sys_i64(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_i64( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<4>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; SM60-NEXT: add.s64 %rd3, %rd2, 1; +; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_monotonic_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<4>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1]; +; SM70-NEXT: add.s64 %rd3, %rd2, 1; +; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3; +; SM70-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(3) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(3) %a monotonic, align 8 + ret void +} - ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 - - ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 - - ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 - - ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 - %e.add = fadd float %e.load, 1. - ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 - - ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 - - ret void -} - -; CHECK-LABEL: shared_monotonic_volatile_sys -define void @shared_monotonic_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @shared_monotonic_sys_float(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_float( +; SM60: { +; SM60-NEXT: .reg .f32 %f<3>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0]; +; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_monotonic_sys_float( +; SM70: { +; SM70-NEXT: .reg .f32 %f<3>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1]; +; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2; +; SM70-NEXT: ret; + %a.load = load atomic float, ptr addrspace(3) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. + store atomic float %a.add, ptr addrspace(3) %a monotonic, align 4 + ret void +} + +define void @shared_monotonic_sys_double(ptr addrspace(3) %a) { +; SM60-LABEL: shared_monotonic_sys_double( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .f64 %fd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0]; +; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; SM60-NEXT: ret; +; +; SM70-LABEL: shared_monotonic_sys_double( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .f64 %fd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1]; +; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2; +; SM70-NEXT: ret; + %a.load = load atomic double, ptr addrspace(3) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr addrspace(3) %a monotonic, align 8 + ret void +} + +; shared_monotonic_volatile_sys + +define void @shared_monotonic_volatile_sys_i8(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + ret void +} - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 +define void @shared_monotonic_volatile_sys_i16(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr addrspace(3) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(3) %a monotonic, align 2 + ret void +} - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 +define void @shared_monotonic_volatile_sys_i32(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(3) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(3) %a monotonic, align 4 + ret void +} - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 - %e.add = fadd float %e.load, 1. 
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 +define void @shared_monotonic_volatile_sys_i64(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i64, ptr addrspace(3) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(3) %a monotonic, align 8 + ret void +} - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 +define void @shared_monotonic_volatile_sys_float(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(3) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr addrspace(3) %a monotonic, align 4 + ret void +} +define void @shared_monotonic_volatile_sys_double(ptr addrspace(3) %a) { +; CHECK-LABEL: shared_monotonic_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(3) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr addrspace(3) %a monotonic, align 8 ret void } ;; local statespace -; CHECK-LABEL: local_weak -define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; local + +define void @local_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store i8 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_i16_param_0]; +; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load i16, ptr addrspace(5) %a + %a.add = add i16 %a.load, 1 + store i16 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load i32, ptr addrspace(5) %a + %a.add = add i32 %a.load, 1 + store i32 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load i64, ptr addrspace(5) %a + %a.add = add i64 %a.load, 1 + store i64 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load float, ptr addrspace(5) %a + %a.add = fadd float %a.load, 1. + store float %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load double, ptr addrspace(5) %a + %a.add = fadd double %a.load, 1. 
+ store double %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_2xi8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi8_param_0]; +; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i8>, ptr addrspace(5) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store <2 x i8> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_4xi8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi8_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.local.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load <4 x i8>, ptr addrspace(5) %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store <4 x i8> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_2xi16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi16_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load <2 x i16>, ptr addrspace(5) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store <2 x i16> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_4xi16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi16_param_0]; +; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i16>, ptr addrspace(5) %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store <4 x i16> %a.add, ptr addrspace(5) %a + ret void +} - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(5) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 
[%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(5) %b - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(5) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(5) %c - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(5) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(5) %d - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(5) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(5) %c - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(5) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(5) %c - - ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load <2 x i8>, ptr addrspace(5) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <2 x i8> %h.add, ptr addrspace(5) %b - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load <4 x i8>, ptr addrspace(5) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <4 x i8> %i.add, ptr addrspace(5) %c - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load <2 x i16>, ptr addrspace(5) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store <2 x i16> %j.add, ptr addrspace(5) %c - - ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load <4 x i16>, ptr addrspace(5) %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store <4 x i16> %k.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load <2 x i32>, ptr addrspace(5) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store <2 x i32> %l.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load <4 x i32>, ptr addrspace(5) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store <4 x i32> %m.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load <2 x i64>, ptr addrspace(5) %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store <2 x i64> %n.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load <2 x float>, ptr addrspace(5) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store <2 x float> %o.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load <4 x float>, ptr addrspace(5) 
%d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store <4 x float> %p.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load <2 x double>, ptr addrspace(5) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store <2 x double> %q.add, ptr addrspace(5) %d - - ret void -} - -; CHECK-LABEL: local_volatile -define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @local_2xi32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi32_param_0]; +; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i32>, ptr addrspace(5) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store <2 x i32> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_4xi32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi32_param_0]; +; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i32>, ptr addrspace(5) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store <4 x i32> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_2xi64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi64_param_0]; +; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load <2 x i64>, ptr addrspace(5) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store <2 x i64> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_2xfloat(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xfloat_param_0]; +; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load <2 x float>, ptr addrspace(5) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store <2 x float> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_4xfloat(ptr addrspace(5) %a) { +; CHECK-LABEL: local_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.u64 %rd1, [local_4xfloat_param_0]; +; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load <4 x float>, ptr addrspace(5) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store <4 x float> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_2xdouble(ptr addrspace(5) %a) { +; CHECK-LABEL: local_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_2xdouble_param_0]; +; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load <2 x double>, ptr addrspace(5) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store <2 x double> %a.add, ptr addrspace(5) %a + ret void +} + +; local_volatile + +define void @local_volatile_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load volatile i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store volatile i8 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i16_param_0]; +; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load volatile i16, ptr addrspace(5) %a + %a.add = add i16 %a.load, 1 + store volatile i16 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile i32, ptr addrspace(5) %a + %a.add = add i32 %a.load, 1 + store volatile i32 %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load volatile i64, ptr addrspace(5) %a + %a.add = add i64 %a.load, 1 + store volatile i64 %a.add, ptr addrspace(5) %a + ret void +} - ; CHECK: ld.local.u16 
%rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(5) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(5) %b - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(5) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(5) %c - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(5) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(5) %d - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(5) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(5) %c - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(5) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(5) %c - - ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %h.load = load volatile <2 x i8>, ptr addrspace(5) %b - %h.add = add <2 x i8> %h.load, <i8 1, i8 1> - ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <2 x i8> %h.add, ptr addrspace(5) %b - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %i.load = load volatile <4 x i8>, ptr addrspace(5) %c - %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <4 x i8> %i.add, ptr addrspace(5) %c - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %j.load = load volatile <2 x i16>, ptr addrspace(5) %c - %j.add = add <2 x i16> %j.load, <i16 1, i16 1> - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile <2 x i16> %j.add, ptr addrspace(5) %c - - ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] - %k.load = load volatile <4 x i16>, ptr addrspace(5) %d - %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> - ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} - store volatile <4 x i16> %k.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %l.load = load volatile <2 x i32>, ptr addrspace(5) %d - %l.add = add <2 x i32> %l.load, <i32 1, i32 1> - ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <2 x i32> %l.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] - %m.load = load volatile <4 x i32>, ptr addrspace(5) %d - %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> - ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} - store volatile <4 x i32> %m.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %n.load = load volatile <2 x i64>, ptr addrspace(5) %d - %n.add = add <2 x i64> %n.load, <i64 1, i64 1> - ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} - store volatile <2 x i64> %n.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %o.load = load volatile <2 x 
float>, ptr addrspace(5) %d - %o.add = fadd <2 x float> %o.load, <float 1., float 1.> - ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <2 x float> %o.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] - %p.load = load volatile <4 x float>, ptr addrspace(5) %d - %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> - ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} - store volatile <4 x float> %p.add, ptr addrspace(5) %d - - ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] - %q.load = load volatile <2 x double>, ptr addrspace(5) %d - %q.add = fadd <2 x double> %q.load, <double 1., double 1.> - ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} - store volatile <2 x double> %q.add, ptr addrspace(5) %d - - ret void -} - -; CHECK-LABEL: local_unordered_sys -define void @local_unordered_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +define void @local_volatile_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load volatile float, ptr addrspace(5) %a + %a.add = fadd float %a.load, 1. + store volatile float %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load volatile double, ptr addrspace(5) %a + %a.add = fadd double %a.load, 1. 
+ store volatile double %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xi8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i8>, ptr addrspace(5) %a + %a.add = add <2 x i8> %a.load, <i8 1, i8 1> + store volatile <2 x i8> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_4xi8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.local.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i8>, ptr addrspace(5) %a + %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> + store volatile <4 x i8> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xi16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i16>, ptr addrspace(5) %a + %a.add = add <2 x i16> %a.load, <i16 1, i16 1> + store volatile <2 x i16> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_4xi16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i16>, ptr addrspace(5) %a + %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> + store volatile <4 x i16> 
%a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xi32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi32_param_0]; +; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i32>, ptr addrspace(5) %a + %a.add = add <2 x i32> %a.load, <i32 1, i32 1> + store volatile <2 x i32> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_4xi32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i32>, ptr addrspace(5) %a + %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> + store volatile <4 x i32> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xi64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: add.s64 %rd4, %rd3, 1; +; CHECK-NEXT: add.s64 %rd5, %rd2, 1; +; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x i64>, ptr addrspace(5) %a + %a.add = add <2 x i64> %a.load, <i64 1, i64 1> + store volatile <2 x i64> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xfloat(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; +; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x float>, ptr addrspace(5) %a + %a.add = fadd <2 x float> %a.load, <float 1., float 1.> + store volatile <2 x float> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_4xfloat(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_4xfloat( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; +; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; +; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x float>, ptr addrspace(5) %a + %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> + store volatile <4 x 
float> %a.add, ptr addrspace(5) %a + ret void +} + +define void @local_volatile_2xdouble(ptr addrspace(5) %a) { +; CHECK-LABEL: local_volatile_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; +; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: ret; + %a.load = load volatile <2 x double>, ptr addrspace(5) %a + %a.add = fadd <2 x double> %a.load, <double 1., double 1.> + store volatile <2 x double> %a.add, ptr addrspace(5) %a + ret void +} + +; local_unordered_sys + +define void @local_unordered_sys_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1 + ret void +} - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2 - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4 +define void @local_unordered_sys_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i16_param_0]; +; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic i16, ptr addrspace(5) %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(5) %a unordered, align 2 + ret void +} - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8 +define void @local_unordered_sys_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(5) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(5) %a unordered, align 4 + ret void +} - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load 
= load atomic float, ptr addrspace(5) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e unordered, align 4 +define void @local_unordered_sys_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(5) %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(5) %a unordered, align 8 + ret void +} - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e unordered, align 8 +define void @local_unordered_sys_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic float, ptr addrspace(5) %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic float %a.add, ptr addrspace(5) %a unordered, align 4 + ret void +} +define void @local_unordered_sys_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic double, ptr addrspace(5) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr addrspace(5) %a unordered, align 8 ret void } -; CHECK-LABEL: local_unordered_volatile_sys -define void @local_unordered_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; local_unordered_volatile_sys + +define void @local_unordered_volatile_sys_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1 + ret void +} - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2 - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4 +define void @local_unordered_volatile_sys_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr addrspace(5) %a unordered, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(5) %a unordered, align 2 + ret void +} - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8 +define void @local_unordered_volatile_sys_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(5) %a unordered, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(5) %a unordered, align 4 + ret void +} - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 
4 +define void @local_unordered_volatile_sys_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i64, ptr addrspace(5) %a unordered, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(5) %a unordered, align 8 + ret void +} - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8 +define void @local_unordered_volatile_sys_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(5) %a unordered, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr addrspace(5) %a unordered, align 4 + ret void +} +define void @local_unordered_volatile_sys_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_unordered_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(5) %a unordered, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr addrspace(5) %a unordered, align 8 ret void } -; CHECK-LABEL: local_monotonic_sys -define void @local_monotonic_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; local_monotonic_sys + +define void @local_monotonic_sys_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + ret void +} - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 +define void @local_monotonic_sys_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i16_param_0]; +; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic i16, ptr addrspace(5) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic i16 %a.add, ptr addrspace(5) %a monotonic, align 2 + ret void +} - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 +define void @local_monotonic_sys_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic i32, ptr addrspace(5) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic i32 %a.add, ptr addrspace(5) %a monotonic, align 4 + ret void +} - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 - %e.add = fadd float %e.load, 1. 
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 +define void @local_monotonic_sys_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic i64, ptr addrspace(5) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic i64 %a.add, ptr addrspace(5) %a monotonic, align 8 + ret void +} - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 +define void @local_monotonic_sys_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic float, ptr addrspace(5) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. + store atomic float %a.add, ptr addrspace(5) %a monotonic, align 4 + ret void +} +define void @local_monotonic_sys_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic double, ptr addrspace(5) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic double %a.add, ptr addrspace(5) %a monotonic, align 8 ret void } -; CHECK-LABEL: local_monotonic_volatile -define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; local_monotonic_volatile_sys + +define void @local_monotonic_volatile_sys_i8(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + ret void +} + +define void @local_monotonic_volatile_sys_i16(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i16, ptr addrspace(5) %a monotonic, align 2 + %a.add = add i16 %a.load, 1 + store atomic volatile i16 %a.add, ptr addrspace(5) %a monotonic, align 2 + ret void +} - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 - %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 +define void @local_monotonic_volatile_sys_i32(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: add.s32 %r2, %r1, 1; +; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i32, ptr addrspace(5) %a monotonic, align 4 + %a.add = add i32 %a.load, 1 + store atomic volatile i32 %a.add, ptr addrspace(5) %a monotonic, align 4 + ret void +} + +define void @local_monotonic_volatile_sys_i64(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: add.s64 %rd3, %rd2, 1; +; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: ret; + %a.load = load atomic volatile i64, ptr addrspace(5) %a monotonic, align 8 + %a.add = add i64 %a.load, 1 + store atomic volatile i64 %a.add, ptr addrspace(5) %a monotonic, align 8 + ret void +} + +define void @local_monotonic_volatile_sys_float(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile float, ptr addrspace(5) %a monotonic, align 4 + %a.add = fadd float %a.load, 1. + store atomic volatile float %a.add, ptr addrspace(5) %a monotonic, align 4 + ret void +} +define void @local_monotonic_volatile_sys_double(ptr addrspace(5) %a) { +; CHECK-LABEL: local_monotonic_volatile_sys_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; +; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: ret; + %a.load = load atomic volatile double, ptr addrspace(5) %a monotonic, align 8 + %a.add = fadd double %a.load, 1. 
+ store atomic volatile double %a.add, ptr addrspace(5) %a monotonic, align 8 ret void } diff --git a/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll b/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll index e139d3c..5bd3580 100644 --- a/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll +++ b/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll @@ -1,12 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s ; The load is to the high byte of the 2-byte store @g = global i8 -75 define void @f(i16 %v) { -; CHECK-LABEL: f -; CHECK: sth 3, -2(1) -; CHECK: lbz 3, -2(1) +; CHECK-LABEL: f: +; CHECK: # %bb.0: +; CHECK-NEXT: addis 4, 2, .LC0@toc@ha +; CHECK-NEXT: sth 3, -2(1) +; CHECK-NEXT: ld 4, .LC0@toc@l(4) +; CHECK-NEXT: lbz 3, -2(1) +; CHECK-NEXT: stb 3, 0(4) +; CHECK-NEXT: blr %p32 = alloca i16 store i16 %v, ptr %p32 %tmp = load i8, ptr %p32 diff --git a/llvm/test/CodeGen/SPIRV/decoration-order.ll b/llvm/test/CodeGen/SPIRV/decoration-order.ll new file mode 100644 index 0000000..e8299e9 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/decoration-order.ll @@ -0,0 +1,15 @@ +; RUN: %if spirv-tools %{ llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; This test checks the OpDecorate MIR is generated after the associated +; vreg definition in the case of an array size declared through this lowering. + +define spir_func i32 @foo() { +entry: + %var = alloca i64 + br label %block + +block: + call void @llvm.memset.p0.i64(ptr align 8 %var, i8 0, i64 24, i1 false) + ret i32 0 +} + +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) diff --git a/llvm/test/CodeGen/X86/tls-function-argument.ll b/llvm/test/CodeGen/X86/tls-function-argument.ll new file mode 100644 index 0000000..9b6ab52 --- /dev/null +++ b/llvm/test/CodeGen/X86/tls-function-argument.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=x86_64 -verify-machineinstrs -relocation-model=pic < %s | FileCheck %s + +; Passing a pointer to thread-local storage to a function can be problematic +; since computing such addresses requires a function call that is introduced +; very late in instruction selection. We need to ensure that we don't introduce +; nested call sequence markers if this function call happens in a call sequence. 
+ +@TLS = internal thread_local global i64 zeroinitializer, align 8 +declare void @bar(ptr) +define internal void @foo() { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: leaq TLS@TLSLD(%rip), %rdi +; CHECK-NEXT: callq __tls_get_addr@PLT +; CHECK-NEXT: leaq TLS@DTPOFF(%rax), %rbx +; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: callq bar@PLT +; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: callq bar@PLT +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + call void @bar(ptr @TLS) + call void @bar(ptr @TLS) + ret void +} diff --git a/llvm/test/Transforms/FunctionSpecialization/cmp-with-range.ll b/llvm/test/Transforms/FunctionSpecialization/cmp-with-range.ll new file mode 100644 index 0000000..d692294 --- /dev/null +++ b/llvm/test/Transforms/FunctionSpecialization/cmp-with-range.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 5 +; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=1 \ +; RUN: -funcspec-for-literal-constant=true \ +; RUN: -funcspec-min-codesize-savings=50 \ +; RUN: -funcspec-min-latency-savings=0 \ +; RUN: -S < %s | FileCheck %s + +; Verify that we are able to estimate the codesize savings arising from a branch +; based on a comparison with a value found to have a constant range by IPSCCP. +define i32 @main() { + %notspec = call i32 @test_use_on_lhs(i32 8) + %spec1 = call i32 @test_use_on_lhs(i32 0) + %spec2 = call i32 @test_use_on_rhs(i32 1) + %sum1 = add i32 %notspec, %spec1 + %sum2 = add i32 %sum1, %spec2 + ret i32 %sum2 +} + +define i32 @test_use_on_lhs(i32 %x) { +entry: + %range = call i32 @foo(), !range !{ i32 1, i32 0 } + %bound = shl nsw nuw i32 %range, 3 + %cmp = icmp uge i32 %x, %bound + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void @do_something() + call void @do_something() + call void @do_something() + call void @do_something() + br label %if.end + +if.end: + %res = phi i32 [ 0, %entry ], [ 1, %if.then] + ret i32 %res +} + +define i32 @test_use_on_rhs(i32 %x) { +entry: + %range = call i32 @foo(), !range !{ i32 1, i32 0 } + %bound = shl nsw nuw i32 %range, 3 + %x.sub = sub nsw nuw i32 %x, 1 + %cmp = icmp ult i32 %bound, %x.sub + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void @do_something() + call void @do_something() + call void @do_something() + call void @do_something() + br label %if.end + +if.end: + %res = phi i32 [ 0, %entry ], [ 1, %if.then] + ret i32 %res +} + +declare i32 @foo() +declare void @do_something() +; CHECK-LABEL: define range(i32 0, 2) i32 @main() { +; CHECK-NEXT: [[NOTSPEC:%.*]] = call i32 @test_use_on_lhs(i32 8) +; CHECK-NEXT: [[SPEC1:%.*]] = call i32 @test_use_on_lhs.specialized.1(i32 0) +; CHECK-NEXT: [[SPEC2:%.*]] = call i32 @test_use_on_rhs.specialized.2(i32 1) +; CHECK-NEXT: [[SUM:%.*]] = add nuw nsw i32 [[NOTSPEC]], 0 +; CHECK-NEXT: [[RES:%.*]] = add nuw nsw i32 [[SUM]], 0 +; CHECK-NEXT: ret i32 [[RES]] +; +; +; CHECK-LABEL: define range(i32 0, 2) i32 @test_use_on_lhs( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0:![0-9]+]] +; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3 +; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[X]], [[BOUND]] +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @do_something() +; 
CHECK-NEXT: call void @do_something() +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[IF_THEN]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +; +; CHECK-LABEL: define range(i32 0, 2) i32 @test_use_on_rhs( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0]] +; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3 +; CHECK-NEXT: [[X_SUB:%.*]] = sub nuw nsw i32 [[X]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[BOUND]], [[X_SUB]] +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: call void @do_something() +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[IF_THEN]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +; +; CHECK-LABEL: define internal i32 @test_use_on_lhs.specialized.1( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0]] +; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3 +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: ret i32 poison +; +; +; CHECK-LABEL: define internal i32 @test_use_on_rhs.specialized.2( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0]] +; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3 +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: ret i32 poison +; +;. +; CHECK: [[RNG0]] = !{i32 1, i32 0} +;. 
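The branch folding that makes the two specializations above profitable can be reproduced in isolation with llvm::ConstantRange. The following sketch is illustrative only and is not part of this commit; it assumes the facts established by the test: %range carries !range !{i32 1, i32 0} (any non-zero i32), %bound is that value shifted left by 3 with nuw (so %bound is at least 8), and the specialized argument is %x = 0.

// Standalone illustration (assumed helper program, not the pass's own code).
#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
#include <cassert>

using namespace llvm;

int main() {
  // %bound = shl nuw nsw i32 %range, 3 with %range known non-zero, so the
  // unsigned range of %bound is [8, 2^32).
  ConstantRange Bound(APInt(32, 8), APInt(32, 0));
  // The specialized call site passes %x = 0.
  ConstantRange X(APInt(32, 0));
  // "icmp uge %x, %bound" is false for every admissible %bound, so the branch
  // to the four do_something() calls is dead in the specialized clone.
  assert(!X.icmp(CmpInst::ICMP_UGE, Bound));
  assert(X.icmp(CmpInst::ICMP_ULT, Bound));
  return 0;
}

The same reasoning covers test_use_on_rhs: with %x = 1, %x.sub is 0, and "icmp ult %bound, 0" can never hold, which is why both specialized bodies collapse to "ret i32 poison" in the generated checks.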
diff --git a/llvm/test/Transforms/SLPVectorizer/reudction-or-non-poisoned.ll b/llvm/test/Transforms/SLPVectorizer/reudction-or-non-poisoned.ll new file mode 100644 index 0000000..ac47c60 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/reudction-or-non-poisoned.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s + +define i1 @test(i32 %x, i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: define i1 @test( +; CHECK-SAME: i32 [[X:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[D]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[B]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP4]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: ret i1 [[TMP6]] +; + %cmp = icmp sgt i32 %x, 1 + %cmp2 = icmp sgt i32 %b, 1 + %cmp3 = icmp sgt i32 %c, 1 + %cmp4 = icmp sgt i32 %d, 1 + %sel2 = select i1 %cmp4, i1 true, i1 %cmp2 + %sel3 = select i1 %sel2, i1 true, i1 %cmp3 + %sel4 = select i1 %cmp, i1 true, i1 %cmp4 + %ret = or i1 %sel3, %sel4 + ret i1 %ret +} diff --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp index c8fd366..9f76e9f 100644 --- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp +++ b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp @@ -469,3 +469,38 @@ TEST_F(FunctionSpecializationTest, PhiNode) { EXPECT_TRUE(Test > 0); } +TEST_F(FunctionSpecializationTest, BinOp) { + // Verify that we can handle binary operators even when only one operand is + // constant. + const char *ModuleString = R"( + define i32 @foo(i1 %a, i1 %b) { + %and1 = and i1 %a, %b + %and2 = and i1 %b, %and1 + %sel = select i1 %and2, i32 1, i32 0 + ret i32 %sel + } + )"; + + Module &M = parseModule(ModuleString); + Function *F = M.getFunction("foo"); + FunctionSpecializer Specializer = getSpecializerFor(F); + InstCostVisitor Visitor = Specializer.getInstCostVisitorFor(F); + + Constant *False = ConstantInt::getFalse(M.getContext()); + BasicBlock &BB = F->front(); + Instruction &And1 = BB.front(); + Instruction &And2 = *++BB.begin(); + Instruction &Select = *++BB.begin(); + + Cost RefCodeSize = getCodeSizeSavings(And1) + getCodeSizeSavings(And2) + + getCodeSizeSavings(Select); + Cost RefLatency = getLatencySavings(F); + + Cost TestCodeSize = Visitor.getCodeSizeSavingsForArg(F->getArg(0), False); + Cost TestLatency = Visitor.getLatencySavingsForKnownConstants(); + + EXPECT_EQ(TestCodeSize, RefCodeSize); + EXPECT_TRUE(TestCodeSize > 0); + EXPECT_EQ(TestLatency, RefLatency); + EXPECT_TRUE(TestLatency > 0); +} |
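The BinOp unit test above exercises InstCostVisitor on a chain in which only one operand of each "and" is a known constant. The fold it relies on, "and %b, false" simplifying to false, is ordinary instruction simplification; the sketch below reproduces it in isolation and is illustrative only (the module name, function name, and query setup are assumptions, not code from this commit).

// Standalone illustration of folding a binary operator with one constant operand.
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);

  // A function with a single unknown i1 argument standing in for %b.
  auto *FTy = FunctionType::get(Type::getInt1Ty(Ctx), {Type::getInt1Ty(Ctx)},
                                /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "demo_fn", M);
  Value *B = F->getArg(0);
  Constant *False = ConstantInt::getFalse(Ctx);

  // "and %b, false" simplifies to false even though %b is not a constant,
  // which is what lets the visitor propagate a constant through %and1/%and2.
  SimplifyQuery Q(M.getDataLayout());
  Value *Folded = simplifyBinOp(Instruction::And, B, False, Q);
  if (Folded == False)
    outs() << "and %b, false folds to false\n";
  return 0;
}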