author     U-BERGUFFLEN\meinersbur <llvm-project@meinersbur.de>  2024-11-04 14:08:55 +0100
committer  U-BERGUFFLEN\meinersbur <llvm-project@meinersbur.de>  2024-11-04 14:08:55 +0100
commit     54bc5758988e505e17618c04a9a36d1cfa72913d
tree       37e93309d367876bc1b6ffac21988665f85dc43d
parent     dacf10168ae0dfd222ed301b93369834d411d139
parent     3cffa3474fd20518e19afa0c0ad3ff602864f688
Merge commit '3cffa3474fd20518e19afa0c0ad3ff602864f688' into users/meinersbur/irbuilder-extract
-rw-r--r--  clang/docs/APINotes.rst  18
-rw-r--r--  clang/lib/AST/Type.cpp  18
-rw-r--r--  clang/test/SemaCXX/nullability_redecl.cpp  27
-rw-r--r--  libclc/generic/lib/math/clc_fma.cl  232
-rw-r--r--  lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py  6
-rw-r--r--  llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h  2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp  2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp  2
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp  140
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h  3
-rw-r--r--  llvm/lib/Target/X86/X86InstrCompiler.td  6
-rw-r--r--  llvm/lib/Transforms/IPO/FunctionSpecialization.cpp  41
-rw-r--r--  llvm/lib/Transforms/Utils/CodeExtractor.cpp  11
-rw-r--r--  llvm/test/CodeGen/NVPTX/load-store.ll  6221
-rw-r--r--  llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll  12
-rw-r--r--  llvm/test/CodeGen/SPIRV/decoration-order.ll  15
-rw-r--r--  llvm/test/CodeGen/X86/tls-function-argument.ll  30
-rw-r--r--  llvm/test/Transforms/FunctionSpecialization/cmp-with-range.ll  127
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/reudction-or-non-poisoned.ll  24
-rw-r--r--  llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp  35
20 files changed, 5299 insertions, 1673 deletions
diff --git a/clang/docs/APINotes.rst b/clang/docs/APINotes.rst
index dcefa68..d20c4f9 100644
--- a/clang/docs/APINotes.rst
+++ b/clang/docs/APINotes.rst
@@ -172,10 +172,28 @@ declaration kind), all of which are optional:
::
Tags:
+ - Name: OwnedStorage
+ SwiftImportAs: owned
+
+:SwiftRetainOp, SwiftReleaseOp:
+
+ Controls the lifetime operations of a class which uses custom reference
+ counting. The class must be annotated as a reference type using
+ ``SwiftImportAs: reference``. The values are either names of global functions,
+ each taking a single parameter of a pointer type, or ``immortal`` for a type
+ that is considered alive for the duration of the program.
+
+ ::
+
+ Tags:
- Name: RefCountedStorage
SwiftImportAs: reference
SwiftReleaseOp: RCRelease
SwiftRetainOp: RCRetain
+ - Name: ImmortalSingleton
+ SwiftImportAs: reference
+ SwiftReleaseOp: immortal
+ SwiftRetainOp: immortal
:SwiftCopyable:
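
For readers skimming the diff, a minimal sketch (illustrative only, not part of the patch) of a C++ class satisfying the contract documented above: custom reference counting with each lifetime operation exposed as a global function taking a single pointer parameter.

    // Hypothetical class matching the RefCountedStorage entry above.
    struct RefCountedStorage {
      int RefCount = 1;
      // ... payload ...
    };

    // Global functions, each taking a single parameter of a pointer type,
    // as the documentation requires for SwiftRetainOp/SwiftReleaseOp.
    void RCRetain(RefCountedStorage *Obj) { ++Obj->RefCount; }
    void RCRelease(RefCountedStorage *Obj) {
      if (--Obj->RefCount == 0)
        delete Obj;
    }
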
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 229721a..6bf2908 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -43,6 +43,7 @@
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
@@ -4774,7 +4775,10 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const {
->getTemplateName()
.getAsTemplateDecl())
if (auto *CTD = dyn_cast<ClassTemplateDecl>(templateDecl))
- return CTD->getTemplatedDecl()->hasAttr<TypeNullableAttr>();
+ return llvm::any_of(
+ CTD->redecls(), [](const RedeclarableTemplateDecl *RTD) {
+ return RTD->getTemplatedDecl()->hasAttr<TypeNullableAttr>();
+ });
return ResultIfUnknown;
case Type::Builtin:
@@ -4841,10 +4845,14 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const {
// For template specializations, look only at primary template attributes.
// This is consistent regardless of whether the instantiation is known.
if (const auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(RD))
- return CTSD->getSpecializedTemplate()
- ->getTemplatedDecl()
- ->hasAttr<TypeNullableAttr>();
- return RD->hasAttr<TypeNullableAttr>();
+ return llvm::any_of(
+ CTSD->getSpecializedTemplate()->redecls(),
+ [](const RedeclarableTemplateDecl *RTD) {
+ return RTD->getTemplatedDecl()->hasAttr<TypeNullableAttr>();
+ });
+ return llvm::any_of(RD->redecls(), [](const TagDecl *RD) {
+ return RD->hasAttr<TypeNullableAttr>();
+ });
}
// Non-pointer types.
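
The change above makes attribute lookup scan the entire redeclaration chain instead of only the primary declaration, so an annotation added on a later redeclaration (for example "class _Nullable Foo;" following a plain "class Foo;") is honored. A reduced sketch of the idiom, using only APIs that appear in the diff:

    #include "clang/AST/Attr.h"
    #include "clang/AST/DeclTemplate.h"
    #include "llvm/ADT/STLExtras.h"

    using namespace clang;

    // True if any redeclaration of the class template carries the attribute.
    static bool anyRedeclHasNullable(const ClassTemplateDecl *CTD) {
      return llvm::any_of(
          CTD->redecls(), [](const RedeclarableTemplateDecl *RTD) {
            return RTD->getTemplatedDecl()->hasAttr<TypeNullableAttr>();
          });
    }
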
diff --git a/clang/test/SemaCXX/nullability_redecl.cpp b/clang/test/SemaCXX/nullability_redecl.cpp
new file mode 100644
index 0000000..99bc521
--- /dev/null
+++ b/clang/test/SemaCXX/nullability_redecl.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -Wno-nullability-declspec %s -verify -Wnullable-to-nonnull-conversion -I%S/Inputs
+
+class Foo;
+using Foo1 = Foo _Nonnull; // expected-error{{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'Foo'}}
+class _Nullable Foo;
+using Foo2 = Foo _Nonnull;
+class Foo;
+using Foo3 = Foo _Nonnull;
+
+template <class T>
+class Bar;
+using Bar1 = Bar<int> _Nonnull; // expected-error{{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'Bar<int>'}}
+template <class T>
+class _Nullable Bar;
+using Bar2 = Bar<int> _Nonnull;
+template <class T>
+class Bar;
+using Bar3 = Bar<int> _Nonnull;
+
+namespace std {
+ template<class T> class unique_ptr;
+ using UP1 = unique_ptr<int> _Nonnull;
+ class X { template<class T> friend class unique_ptr; };
+ using UP2 = unique_ptr<int> _Nonnull;
+ template<class T> class unique_ptr;
+ using UP3 = unique_ptr<int> _Nonnull;
+}
diff --git a/libclc/generic/lib/math/clc_fma.cl b/libclc/generic/lib/math/clc_fma.cl
index dee90e9..34355a3 100644
--- a/libclc/generic/lib/math/clc_fma.cl
+++ b/libclc/generic/lib/math/clc_fma.cl
@@ -22,137 +22,141 @@
#include <clc/clc.h>
+#include "../clcmacro.h"
#include "config.h"
#include "math.h"
-#include "../clcmacro.h"
struct fp {
- ulong mantissa;
- int exponent;
- uint sign;
+ ulong mantissa;
+ int exponent;
+ uint sign;
};
-_CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c)
-{
- /* special cases */
- if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b))
- return mad(a, b, c);
+_CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
+ /* special cases */
+ if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b))
+ return mad(a, b, c);
- /* If only c is inf, and both a,b are regular numbers, the result is c*/
- if (isinf(c))
- return c;
+ /* If only c is inf, and both a,b are regular numbers, the result is c*/
+ if (isinf(c))
+ return c;
- a = __clc_flush_denormal_if_not_supported(a);
- b = __clc_flush_denormal_if_not_supported(b);
- c = __clc_flush_denormal_if_not_supported(c);
+ a = __clc_flush_denormal_if_not_supported(a);
+ b = __clc_flush_denormal_if_not_supported(b);
+ c = __clc_flush_denormal_if_not_supported(c);
- if (c == 0)
- return a * b;
+ if (c == 0)
+ return a * b;
- struct fp st_a, st_b, st_c;
+ struct fp st_a, st_b, st_c;
- st_a.exponent = a == .0f ? 0 : ((as_uint(a) & 0x7f800000) >> 23) - 127;
- st_b.exponent = b == .0f ? 0 : ((as_uint(b) & 0x7f800000) >> 23) - 127;
- st_c.exponent = c == .0f ? 0 : ((as_uint(c) & 0x7f800000) >> 23) - 127;
+ st_a.exponent = a == .0f ? 0 : ((as_uint(a) & 0x7f800000) >> 23) - 127;
+ st_b.exponent = b == .0f ? 0 : ((as_uint(b) & 0x7f800000) >> 23) - 127;
+ st_c.exponent = c == .0f ? 0 : ((as_uint(c) & 0x7f800000) >> 23) - 127;
- st_a.mantissa = a == .0f ? 0 : (as_uint(a) & 0x7fffff) | 0x800000;
- st_b.mantissa = b == .0f ? 0 : (as_uint(b) & 0x7fffff) | 0x800000;
- st_c.mantissa = c == .0f ? 0 : (as_uint(c) & 0x7fffff) | 0x800000;
+ st_a.mantissa = a == .0f ? 0 : (as_uint(a) & 0x7fffff) | 0x800000;
+ st_b.mantissa = b == .0f ? 0 : (as_uint(b) & 0x7fffff) | 0x800000;
+ st_c.mantissa = c == .0f ? 0 : (as_uint(c) & 0x7fffff) | 0x800000;
- st_a.sign = as_uint(a) & 0x80000000;
- st_b.sign = as_uint(b) & 0x80000000;
- st_c.sign = as_uint(c) & 0x80000000;
+ st_a.sign = as_uint(a) & 0x80000000;
+ st_b.sign = as_uint(b) & 0x80000000;
+ st_c.sign = as_uint(c) & 0x80000000;
- // Multiplication.
- // Move the product to the highest bits to maximize precision
- // mantissa is 24 bits => product is 48 bits, 2 bits non-fraction.
- // Add one bit for future addition overflow,
- // add another bit to detect subtraction underflow
- struct fp st_mul;
- st_mul.sign = st_a.sign ^ st_b.sign;
- st_mul.mantissa = (st_a.mantissa * st_b.mantissa) << 14ul;
- st_mul.exponent = st_mul.mantissa ? st_a.exponent + st_b.exponent : 0;
+ // Multiplication.
+ // Move the product to the highest bits to maximize precision
+ // mantissa is 24 bits => product is 48 bits, 2 bits non-fraction.
+ // Add one bit for future addition overflow,
+ // add another bit to detect subtraction underflow
+ struct fp st_mul;
+ st_mul.sign = st_a.sign ^ st_b.sign;
+ st_mul.mantissa = (st_a.mantissa * st_b.mantissa) << 14ul;
+ st_mul.exponent = st_mul.mantissa ? st_a.exponent + st_b.exponent : 0;
- // FIXME: Detecting a == 0 || b == 0 above crashed GCN isel
- if (st_mul.exponent == 0 && st_mul.mantissa == 0)
- return c;
+ // FIXME: Detecting a == 0 || b == 0 above crashed GCN isel
+ if (st_mul.exponent == 0 && st_mul.mantissa == 0)
+ return c;
// Mantissa is 23 fractional bits, shift it the same way as product mantissa
#define C_ADJUST 37ul
- // both exponents are bias adjusted
- int exp_diff = st_mul.exponent - st_c.exponent;
-
- st_c.mantissa <<= C_ADJUST;
- ulong cutoff_bits = 0;
- ulong cutoff_mask = (1ul << abs(exp_diff)) - 1ul;
- if (exp_diff > 0) {
- cutoff_bits = exp_diff >= 64 ? st_c.mantissa : (st_c.mantissa & cutoff_mask);
- st_c.mantissa = exp_diff >= 64 ? 0 : (st_c.mantissa >> exp_diff);
- } else {
- cutoff_bits = -exp_diff >= 64 ? st_mul.mantissa : (st_mul.mantissa & cutoff_mask);
- st_mul.mantissa = -exp_diff >= 64 ? 0 : (st_mul.mantissa >> -exp_diff);
- }
-
- struct fp st_fma;
- st_fma.sign = st_mul.sign;
- st_fma.exponent = max(st_mul.exponent, st_c.exponent);
- if (st_c.sign == st_mul.sign) {
- st_fma.mantissa = st_mul.mantissa + st_c.mantissa;
- } else {
- // cutoff bits borrow one
- st_fma.mantissa = st_mul.mantissa - st_c.mantissa - (cutoff_bits && (st_mul.exponent > st_c.exponent) ? 1 : 0);
- }
-
- // underflow: st_c.sign != st_mul.sign, and magnitude switches the sign
- if (st_fma.mantissa > LONG_MAX) {
- st_fma.mantissa = 0 - st_fma.mantissa;
- st_fma.sign = st_mul.sign ^ 0x80000000;
- }
-
- // detect overflow/underflow
- int overflow_bits = 3 - clz(st_fma.mantissa);
-
- // adjust exponent
- st_fma.exponent += overflow_bits;
-
- // handle underflow
- if (overflow_bits < 0) {
- st_fma.mantissa <<= -overflow_bits;
- overflow_bits = 0;
- }
-
- // rounding
- ulong trunc_mask = (1ul << (C_ADJUST + overflow_bits)) - 1;
- ulong trunc_bits = (st_fma.mantissa & trunc_mask) | (cutoff_bits != 0);
- ulong last_bit = st_fma.mantissa & (1ul << (C_ADJUST + overflow_bits));
- ulong grs_bits = (0x4ul << (C_ADJUST - 3 + overflow_bits));
-
- // round to nearest even
- if ((trunc_bits > grs_bits) ||
- (trunc_bits == grs_bits && last_bit != 0))
- st_fma.mantissa += (1ul << (C_ADJUST + overflow_bits));
-
- // Shift mantissa back to bit 23
- st_fma.mantissa = (st_fma.mantissa >> (C_ADJUST + overflow_bits));
-
- // Detect rounding overflow
- if (st_fma.mantissa > 0xffffff) {
- ++st_fma.exponent;
- st_fma.mantissa >>= 1;
- }
-
- if (st_fma.mantissa == 0)
- return .0f;
-
- // Floating point range limit
- if (st_fma.exponent > 127)
- return as_float(as_uint(INFINITY) | st_fma.sign);
-
- // Flush denormals
- if (st_fma.exponent <= -127)
- return as_float(st_fma.sign);
-
- return as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) | ((uint)st_fma.mantissa & 0x7fffff));
+ // both exponents are bias adjusted
+ int exp_diff = st_mul.exponent - st_c.exponent;
+
+ st_c.mantissa <<= C_ADJUST;
+ ulong cutoff_bits = 0;
+ ulong cutoff_mask = (1ul << abs(exp_diff)) - 1ul;
+ if (exp_diff > 0) {
+ cutoff_bits =
+ exp_diff >= 64 ? st_c.mantissa : (st_c.mantissa & cutoff_mask);
+ st_c.mantissa = exp_diff >= 64 ? 0 : (st_c.mantissa >> exp_diff);
+ } else {
+ cutoff_bits =
+ -exp_diff >= 64 ? st_mul.mantissa : (st_mul.mantissa & cutoff_mask);
+ st_mul.mantissa = -exp_diff >= 64 ? 0 : (st_mul.mantissa >> -exp_diff);
+ }
+
+ struct fp st_fma;
+ st_fma.sign = st_mul.sign;
+ st_fma.exponent = max(st_mul.exponent, st_c.exponent);
+ if (st_c.sign == st_mul.sign) {
+ st_fma.mantissa = st_mul.mantissa + st_c.mantissa;
+ } else {
+ // cutoff bits borrow one
+ st_fma.mantissa =
+ st_mul.mantissa - st_c.mantissa -
+ (cutoff_bits && (st_mul.exponent > st_c.exponent) ? 1 : 0);
+ }
+
+ // underflow: st_c.sign != st_mul.sign, and magnitude switches the sign
+ if (st_fma.mantissa > LONG_MAX) {
+ st_fma.mantissa = 0 - st_fma.mantissa;
+ st_fma.sign = st_mul.sign ^ 0x80000000;
+ }
+
+ // detect overflow/underflow
+ int overflow_bits = 3 - clz(st_fma.mantissa);
+
+ // adjust exponent
+ st_fma.exponent += overflow_bits;
+
+ // handle underflow
+ if (overflow_bits < 0) {
+ st_fma.mantissa <<= -overflow_bits;
+ overflow_bits = 0;
+ }
+
+ // rounding
+ ulong trunc_mask = (1ul << (C_ADJUST + overflow_bits)) - 1;
+ ulong trunc_bits = (st_fma.mantissa & trunc_mask) | (cutoff_bits != 0);
+ ulong last_bit = st_fma.mantissa & (1ul << (C_ADJUST + overflow_bits));
+ ulong grs_bits = (0x4ul << (C_ADJUST - 3 + overflow_bits));
+
+ // round to nearest even
+ if ((trunc_bits > grs_bits) || (trunc_bits == grs_bits && last_bit != 0))
+ st_fma.mantissa += (1ul << (C_ADJUST + overflow_bits));
+
+ // Shift mantissa back to bit 23
+ st_fma.mantissa = (st_fma.mantissa >> (C_ADJUST + overflow_bits));
+
+ // Detect rounding overflow
+ if (st_fma.mantissa > 0xffffff) {
+ ++st_fma.exponent;
+ st_fma.mantissa >>= 1;
+ }
+
+ if (st_fma.mantissa == 0)
+ return .0f;
+
+ // Floating point range limit
+ if (st_fma.exponent > 127)
+ return as_float(as_uint(INFINITY) | st_fma.sign);
+
+ // Flush denormals
+ if (st_fma.exponent <= -127)
+ return as_float(st_fma.sign);
+
+ return as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
+ ((uint)st_fma.mantissa & 0x7fffff));
}
-_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_sw_fma, float, float, float)
+_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_sw_fma, float,
+ float, float)
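
__clc_sw_fma above is a software fused multiply-add: it computes a*b+c with a single rounding at the end, using round to nearest even. A small host-side C++ check, independent of libclc and assuming only std::fma from <cmath>, shows the difference that single rounding makes:

    #include <cassert>
    #include <cmath>

    int main() {
      float a = 1.0f + 0x1p-23f; // 1 + 2^-23
      float b = 1.0f - 0x1p-23f; // 1 - 2^-23
      float c = -1.0f;

      // The exact product is (1 + 2^-23) * (1 - 2^-23) = 1 - 2^-46.
      // Rounding a*b to float first loses the 2^-46 term, so the sum is 0.
      assert(a * b + c == 0.0f);

      // A fused multiply-add rounds only once, preserving the residual.
      assert(std::fma(a, b, c) == -0x1p-46f);
      return 0;
    }
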
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py
index 7167f4c..3fefe87 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py
@@ -47,6 +47,12 @@ for r in range(2):
name = f"test_r{r}"
defines = [f"REVISION={r}"]
+ # LLDB's FormatterCache caches on DW_AT_name. A change introduced in
+ # clang-17 (commit bee886052) changed the contents of DW_AT_name for
+ # template specializations, which broke FormatterCache assumptions
+ # causing this test to fail. This was reverted in newer versions of clang
+ # with commit 52a9ba7ca.
+ @skipIf(compiler="clang", compiler_version=["=", "17"])
@functools.wraps(LibcxxOptionalDataFormatterSimulatorTestCase._run_test)
def test_method(self, defines=defines):
LibcxxOptionalDataFormatterSimulatorTestCase._run_test(self, defines)
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
index e82155a..ff5af59 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
@@ -155,7 +155,7 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
Function *F;
const DataLayout &DL;
TargetTransformInfo &TTI;
- SCCPSolver &Solver;
+ const SCCPSolver &Solver;
ConstMap KnownConstants;
// Basic blocks known to be unreachable after constant propagation.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 526305d..892912a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -990,13 +990,13 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg,
Register VarReg = MRI->createGenericVirtualRegister(LLT::scalar(64));
GR.add(GV, GR.CurMF, VarReg);
- buildOpDecorate(VarReg, I, TII, SPIRV::Decoration::Constant, {});
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVariable))
.addDef(VarReg)
.addUse(GR.getSPIRVTypeID(VarTy))
.addImm(SPIRV::StorageClass::UniformConstant)
.addUse(Const)
.constrainAllUses(TII, TRI, RBI);
+ buildOpDecorate(VarReg, I, TII, SPIRV::Decoration::Constant, {});
SPIRVType *SourceTy = GR.getOrCreateSPIRVPointerType(
ValTy, I, TII, SPIRV::StorageClass::UniformConstant);
SrcReg = MRI->createGenericVirtualRegister(LLT::scalar(64));
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index cc34cf8..790d86f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -829,7 +829,7 @@ static void insertSpirvDecorations(MachineFunction &MF, MachineIRBuilder MIB) {
for (MachineInstr &MI : MBB) {
if (!isSpvIntrinsic(MI, Intrinsic::spv_assign_decoration))
continue;
- MIB.setInsertPt(*MI.getParent(), MI);
+ MIB.setInsertPt(*MI.getParent(), MI.getNextNode());
buildOpSpirvDecorations(MI.getOperand(1).getReg(), MIB,
MI.getOperand(2).getMetadata());
ToErase.push_back(&MI);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0ae814d..1fd6b24 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18831,44 +18831,66 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
-static SDValue
-GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
- SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
- unsigned char OperandFlags, bool LocalDynamic = false) {
+static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
+ const EVT PtrVT, unsigned ReturnReg,
+ unsigned char OperandFlags,
+ bool LoadGlobalBaseReg = false,
+ bool LocalDynamic = false) {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDLoc dl(GA);
SDValue TGA;
bool UseTLSDESC = DAG.getTarget().useTLSDESC();
+ SDValue Chain = DAG.getEntryNode();
+ SDValue Ret;
if (LocalDynamic && UseTLSDESC) {
TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
auto UI = TGA->use_begin();
// Reuse existing GetTLSADDR node if we can find it.
- if (UI != TGA->use_end())
- return SDValue(*UI->use_begin()->use_begin(), 0);
+ if (UI != TGA->use_end()) {
+ // TLSDESC uses TGA.
+ auto TLSDescOp = UI;
+ assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
+ "Unexpected TLSDESC DAG");
+ // CALLSEQ_END uses TGA via a chain and glue.
+ auto *CallSeqEndOp = TLSDescOp->getGluedUser();
+ assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
+ "Unexpected TLSDESC DAG");
+ // CopyFromReg uses CALLSEQ_END via a chain and glue.
+ auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
+ assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
+ "Unexpected TLSDESC DAG");
+ Ret = SDValue(CopyFromRegOp, 0);
+ }
} else {
TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
GA->getOffset(), OperandFlags);
}
- X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
- : LocalDynamic ? X86ISD::TLSBASEADDR
- : X86ISD::TLSADDR;
+ if (!Ret) {
+ X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
+ : LocalDynamic ? X86ISD::TLSBASEADDR
+ : X86ISD::TLSADDR;
- if (InGlue) {
- SDValue Ops[] = { Chain, TGA, *InGlue };
- Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
- } else {
- SDValue Ops[] = { Chain, TGA };
- Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
- }
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+ if (LoadGlobalBaseReg) {
+ SDValue InGlue;
+ Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
+ InGlue);
+ InGlue = Chain.getValue(1);
+ Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
+ } else {
+ Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
+ }
+ Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
- // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
- MFI.setAdjustsStack(true);
- MFI.setHasCalls(true);
+ // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
+ MFI.setHasCalls(true);
- SDValue Glue = Chain.getValue(1);
- SDValue Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
+ SDValue Glue = Chain.getValue(1);
+ Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
+ }
if (!UseTLSDESC)
return Ret;
@@ -18887,30 +18909,22 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
- SDValue InGlue;
- SDLoc dl(GA); // ? function entry point might be better
- SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), PtrVT), InGlue);
- InGlue = Chain.getValue(1);
-
- return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
+ return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
+ /*LoadGlobalBaseReg=*/true);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
- return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
- X86::RAX, X86II::MO_TLSGD);
+ return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
static SDValue
LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
- return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
- X86::EAX, X86II::MO_TLSGD);
+ return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
@@ -18919,22 +18933,20 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
SDLoc dl(GA);
// Get the start address of the TLS block for this module.
- X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
- .getInfo<X86MachineFunctionInfo>();
+ X86MachineFunctionInfo *MFI =
+ DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
SDValue Base;
if (Is64Bit) {
unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
- Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
- X86II::MO_TLSLD, /*LocalDynamic=*/true);
+ Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
+ /*LoadGlobalBaseReg=*/false,
+ /*LocalDynamic=*/true);
} else {
- SDValue InGlue;
- SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
- InGlue = Chain.getValue(1);
- Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
- X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+ Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
+ /*LoadGlobalBaseReg=*/true,
+ /*LocalDynamic=*/true);
}
// Note: the CleanupLocalDynamicTLSPass will remove redundant computations
@@ -36060,36 +36072,6 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
- MachineBasicBlock *BB) const {
- // So, here we replace TLSADDR with the sequence:
- // adjust_stackdown -> TLSADDR -> adjust_stackup.
- // We need this because TLSADDR is lowered into calls
- // inside MC, therefore without the two markers shrink-wrapping
- // may push the prologue/epilogue pass them.
- const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- const MIMetadata MIMD(MI);
- MachineFunction &MF = *BB->getParent();
-
- // Emit CALLSEQ_START right before the instruction.
- MF.getFrameInfo().setAdjustsStack(true);
- unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
- MachineInstrBuilder CallseqStart =
- BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
- BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
-
- // Emit CALLSEQ_END right after the instruction.
- // We don't call erase from parent because we want to keep the
- // original instruction around.
- unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
- MachineInstrBuilder CallseqEnd =
- BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
- BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
-
- return BB;
-}
-
-MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const {
// This is pretty easy. We're taking the value that we received from
@@ -37091,16 +37073,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return X86::TMM0_TMM1 + Imm / 2;
};
switch (MI.getOpcode()) {
- default: llvm_unreachable("Unexpected instr type to insert");
- case X86::TLS_addr32:
- case X86::TLS_addr64:
- case X86::TLS_addrX32:
- case X86::TLS_base_addr32:
- case X86::TLS_base_addr64:
- case X86::TLS_base_addrX32:
- case X86::TLS_desc32:
- case X86::TLS_desc64:
- return EmitLoweredTLSAddr(MI, BB);
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
case X86::INDIRECT_THUNK_CALL32:
case X86::INDIRECT_THUNK_CALL64:
case X86::INDIRECT_THUNK_TCRETURN32:
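
Taken together, the X86 changes emit the CALLSEQ_START/CALLSEQ_END markers around TLS pseudo-calls while building the SelectionDAG in GetTLSADDR, rather than patching them in later through the now-deleted custom inserter EmitLoweredTLSAddr, and they reuse an existing TLSDESC call in the local-dynamic path instead of emitting a duplicate. For context, a hedged C++ sketch of the kind of source that reaches this lowering (assuming an ELF target under the general-dynamic TLS model; the patch's own regression test is tls-function-argument.ll):

    // Accessing a thread_local under the general-dynamic model lowers to a
    // TLSADDR/TLSDESC pseudo, which is emitted as a call to __tls_get_addr
    // (or a TLS-descriptor call) and therefore needs call-frame markers.
    thread_local int Counter = 0;

    int *counterAddress() { return &Counter; }
    int bump() { return ++Counter; }
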
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 14ada17..2db25d6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1848,9 +1848,6 @@ namespace llvm {
MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
- MachineBasicBlock *BB) const;
-
MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index a05c3f0..51cee2e 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -478,7 +478,7 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF],
- usesCustomInserter = 1, Uses = [ESP, SSP] in {
+ Uses = [ESP, SSP] in {
def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLS_addr32",
[(X86tlsaddr tls32addr:$sym)]>,
@@ -498,7 +498,7 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF],
- usesCustomInserter = 1, Uses = [RSP, SSP] in {
+ Uses = [RSP, SSP] in {
def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_addr64",
[(X86tlsaddr tls64addr:$sym)]>,
@@ -520,7 +520,7 @@ def TLS_base_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
// TLSDESC only clobbers EAX and EFLAGS. ESP is marked as a use to prevent
// stack-pointer assignments that appear immediately before calls from
// potentially appearing dead.
-let Defs = [EAX, EFLAGS], usesCustomInserter = 1, Uses = [RSP, SSP] in {
+let Defs = [EAX, EFLAGS], Uses = [RSP, SSP] in {
def TLS_desc32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLS_desc32", [(X86tlsdesc tls32addr:$sym)]>;
def TLS_desc64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 1efec22..17d8283 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -472,16 +472,24 @@ Constant *InstCostVisitor::visitCastInst(CastInst &I) {
Constant *InstCostVisitor::visitCmpInst(CmpInst &I) {
assert(LastVisited != KnownConstants.end() && "Invalid iterator!");
- bool Swap = I.getOperand(1) == LastVisited->first;
- Value *V = Swap ? I.getOperand(0) : I.getOperand(1);
+ Constant *Const = LastVisited->second;
+ bool ConstOnRHS = I.getOperand(1) == LastVisited->first;
+ Value *V = ConstOnRHS ? I.getOperand(0) : I.getOperand(1);
Constant *Other = findConstantFor(V, KnownConstants);
- if (!Other)
- return nullptr;
- Constant *Const = LastVisited->second;
- return Swap ?
- ConstantFoldCompareInstOperands(I.getPredicate(), Other, Const, DL)
- : ConstantFoldCompareInstOperands(I.getPredicate(), Const, Other, DL);
+ if (Other) {
+ if (ConstOnRHS)
+ std::swap(Const, Other);
+ return ConstantFoldCompareInstOperands(I.getPredicate(), Const, Other, DL);
+ }
+
+ // If we haven't found Other to be a specific constant value, we may still be
+ // able to constant fold using information from the lattice value.
+ const ValueLatticeElement &ConstLV = ValueLatticeElement::get(Const);
+ const ValueLatticeElement &OtherLV = Solver.getLatticeValueFor(V);
+ auto &V1State = ConstOnRHS ? OtherLV : ConstLV;
+ auto &V2State = ConstOnRHS ? ConstLV : OtherLV;
+ return V1State.getCompare(I.getPredicate(), I.getType(), V2State, DL);
}
Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) {
@@ -493,16 +501,17 @@ Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) {
Constant *InstCostVisitor::visitBinaryOperator(BinaryOperator &I) {
assert(LastVisited != KnownConstants.end() && "Invalid iterator!");
- bool Swap = I.getOperand(1) == LastVisited->first;
- Value *V = Swap ? I.getOperand(0) : I.getOperand(1);
+ bool ConstOnRHS = I.getOperand(1) == LastVisited->first;
+ Value *V = ConstOnRHS ? I.getOperand(0) : I.getOperand(1);
Constant *Other = findConstantFor(V, KnownConstants);
- if (!Other)
- return nullptr;
+ Value *OtherVal = Other ? Other : V;
+ Value *ConstVal = LastVisited->second;
- Constant *Const = LastVisited->second;
- return dyn_cast_or_null<Constant>(Swap ?
- simplifyBinOp(I.getOpcode(), Other, Const, SimplifyQuery(DL))
- : simplifyBinOp(I.getOpcode(), Const, Other, SimplifyQuery(DL)));
+ if (ConstOnRHS)
+ std::swap(ConstVal, OtherVal);
+
+ return dyn_cast_or_null<Constant>(
+ simplifyBinOp(I.getOpcode(), ConstVal, OtherVal, SimplifyQuery(DL)));
}
Constant *FunctionSpecializer::getPromotableAlloca(AllocaInst *Alloca,
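
The new fallback in visitCmpInst folds a comparison even when the non-constant operand is known only as a lattice value, such as a constant range, rather than as a single constant. A minimal sketch of the underlying idea using ConstantRange (illustrative; the pass itself queries the solver and calls ValueLatticeElement::getCompare as shown above):

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/ConstantRange.h"
    #include "llvm/IR/InstrTypes.h"

    using namespace llvm;

    // If all we know is that %x lies in [0, 5), then "icmp ult %x, 10"
    // still folds to true: every value in the range satisfies the predicate.
    bool foldsToTrue() {
      ConstantRange X(APInt(32, 0), APInt(32, 5)); // %x in [0, 5)
      ConstantRange Ten(APInt(32, 10));            // the constant operand
      return X.icmp(CmpInst::ICMP_ULT, Ten);       // true for all pairs
    }
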
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 440912e..b6906ab 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -823,21 +823,16 @@ Function *CodeExtractor::constructFunctionDeclaration(
// Assemble the function's parameter lists.
std::vector<Type *> ParamTy;
std::vector<Type *> AggParamTy;
- std::vector<std::tuple<unsigned, Value *>> NumberedInputs;
- std::vector<std::tuple<unsigned, Value *>> NumberedOutputs;
const DataLayout &DL = M->getDataLayout();
// Add the types of the input values to the function's argument list
- unsigned ArgNum = 0;
for (Value *value : inputs) {
LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n");
if (AggregateArgs && !ExcludeArgsFromAggregate.contains(value)) {
AggParamTy.push_back(value->getType());
StructValues.insert(value);
- } else {
+ } else
ParamTy.push_back(value->getType());
- NumberedInputs.emplace_back(ArgNum++, value);
- }
}
// Add the types of the output values to the function's argument list.
@@ -846,11 +841,9 @@ Function *CodeExtractor::constructFunctionDeclaration(
if (AggregateArgs && !ExcludeArgsFromAggregate.contains(output)) {
AggParamTy.push_back(output->getType());
StructValues.insert(output);
- } else {
+ } else
ParamTy.push_back(
PointerType::get(output->getType(), DL.getAllocaAddrSpace()));
- NumberedOutputs.emplace_back(ArgNum++, output);
- }
}
assert(
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index f922fd9..a4be81a 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70
@@ -20,1595 +21,4947 @@
; TODO: optimize .sys.shared into .cta.shared or .cluster.shared .
-; generic statespace
-
-; CHECK-LABEL: generic_weak
-define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
- ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+;; generic statespace
+
+; generic
+
+define void @generic_i8(ptr %a) {
+; CHECK-LABEL: generic_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_i8_param_0];
+; CHECK-NEXT: ld.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load i8, ptr %a
%a.add = add i8 %a.load, 1
- ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store i8 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_i16(ptr %a) {
+; CHECK-LABEL: generic_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_i16_param_0];
+; CHECK-NEXT: ld.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i16, ptr %a
+ %a.add = add i16 %a.load, 1
+ store i16 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_i32(ptr %a) {
+; CHECK-LABEL: generic_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_i32_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load i32, ptr %a
+ %a.add = add i32 %a.load, 1
+ store i32 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_i64(ptr %a) {
+; CHECK-LABEL: generic_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_i64_param_0];
+; CHECK-NEXT: ld.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load i64, ptr %a
+ %a.add = add i64 %a.load, 1
+ store i64 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_float(ptr %a) {
+; CHECK-LABEL: generic_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_float_param_0];
+; CHECK-NEXT: ld.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load float, ptr %a
+ %a.add = fadd float %a.load, 1.
+ store float %a.add, ptr %a
+ ret void
+}
+
+define void @generic_double(ptr %a) {
+; CHECK-LABEL: generic_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_double_param_0];
+; CHECK-NEXT: ld.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load double, ptr %a
+ %a.add = fadd double %a.load, 1.
+ store double %a.add, ptr %a
+ ret void
+}
+
+; TODO: make the lowering of these weak vector ops consistent with
+; those of the next tests. This test lowers to a weak PTX
+; vector op, but the next test lowers to a scalar PTX op.
+define void @generic_2xi8(ptr %a) {
+; CHECK-LABEL: generic_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi8_param_0];
+; CHECK-NEXT: ld.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i8>, ptr %a
+ %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+ store <2 x i8> %a.add, ptr %a
+ ret void
+}
+
+; TODO: make the lowering of these weak vector ops consistent with
+; those of the previous test. This test lowers to a weak
+; PTX scalar op, but the prior test lowers to a vector PTX op.
+define void @generic_4xi8(ptr %a) {
+; CHECK-LABEL: generic_4xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi8_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: st.u32 [%rd1], %r12;
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i8>, ptr %a
+ %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+ store <4 x i8> %a.add, ptr %a
+ ret void
+}
- ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr %b
-
- ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr %c
-
- ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr %d
-
- ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr %c
-
- ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr %d
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr %d
-
- ; TODO: make the lowering of these weak vector ops consistent with
- ; those of the next tests. This test lowers to a weak PTX
- ; vector op, but the next test lowers to a scalar PTX op.
- ; CHECK: ld.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %h.load = load <2 x i8>, ptr %b
- %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
- ; CHECK: st.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store <2 x i8> %h.add, ptr %b
-
- ; TODO: make the lowering of these weak vector ops consistent with
- ; those of the previous test. This test lowers to a weak
- ; PTX scalar op, but the prior test lowers to a vector PTX op.
- ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %i.load = load <4 x i8>, ptr %c
- %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
- ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store <4 x i8> %i.add, ptr %c
-
- ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %j.load = load <2 x i16>, ptr %c
- %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
- ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store <2 x i16> %j.add, ptr %c
-
- ; CHECK: ld.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %k.load = load <4 x i16>, ptr %d
- %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
- ; CHECK: st.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store <4 x i16> %k.add, ptr %d
-
- ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %l.load = load <2 x i32>, ptr %d
- %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
- ; CHECK: st.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
- store <2 x i32> %l.add, ptr %d
-
- ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %m.load = load <4 x i32>, ptr %d
- %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
- ; CHECK: st.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
- store <4 x i32> %m.add, ptr %d
-
- ; CHECK: ld.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %n.load = load <2 x i64>, ptr %d
- %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
- ; CHECK: st.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
- store <2 x i64> %n.add, ptr %d
-
- ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %o.load = load <2 x float>, ptr %d
- %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
- ; CHECK: st.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
- store <2 x float> %o.add, ptr %d
-
- ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %p.load = load <4 x float>, ptr %d
- %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
- ; CHECK: st.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
- store <4 x float> %p.add, ptr %d
-
- ; CHECK: ld.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %q.load = load <2 x double>, ptr %d
- %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
- ; CHECK: st.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
- store <2 x double> %q.add, ptr %d
-
- ret void
-}
-
-; CHECK-LABEL: generic_volatile
-define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
- ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+define void @generic_2xi16(ptr %a) {
+; CHECK-LABEL: generic_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi16_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i16>, ptr %a
+ %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+ store <2 x i16> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_4xi16(ptr %a) {
+; CHECK-LABEL: generic_4xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi16_param_0];
+; CHECK-NEXT: ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i16>, ptr %a
+ %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+ store <4 x i16> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_2xi32(ptr %a) {
+; CHECK-LABEL: generic_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi32_param_0];
+; CHECK-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i32>, ptr %a
+ %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+ store <2 x i32> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_4xi32(ptr %a) {
+; CHECK-LABEL: generic_4xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi32_param_0];
+; CHECK-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i32>, ptr %a
+ %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+ store <4 x i32> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_2xi64(ptr %a) {
+; CHECK-LABEL: generic_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi64_param_0];
+; CHECK-NEXT: ld.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i64>, ptr %a
+ %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+ store <2 x i64> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_2xfloat(ptr %a) {
+; CHECK-LABEL: generic_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xfloat_param_0];
+; CHECK-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x float>, ptr %a
+ %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
+ store <2 x float> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_4xfloat(ptr %a) {
+; CHECK-LABEL: generic_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xfloat_param_0];
+; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x float>, ptr %a
+ %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
+ store <4 x float> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_2xdouble(ptr %a) {
+; CHECK-LABEL: generic_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xdouble_param_0];
+; CHECK-NEXT: ld.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x double>, ptr %a
+ %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
+ store <2 x double> %a.add, ptr %a
+ ret void
+}
+
+; generic_volatile
+
+define void @generic_volatile_i8(ptr %a) {
+; CHECK-LABEL: generic_volatile_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i8_param_0];
+; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load volatile i8, ptr %a
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store volatile i8 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_i16(ptr %a) {
+; CHECK-LABEL: generic_volatile_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i16_param_0];
+; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i16, ptr %a
+ %a.add = add i16 %a.load, 1
+ store volatile i16 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_i32(ptr %a) {
+; CHECK-LABEL: generic_volatile_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i32_param_0];
+; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i32, ptr %a
+ %a.add = add i32 %a.load, 1
+ store volatile i32 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_i64(ptr %a) {
+; CHECK-LABEL: generic_volatile_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i64_param_0];
+; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i64, ptr %a
+ %a.add = add i64 %a.load, 1
+ store volatile i64 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_float(ptr %a) {
+; CHECK-LABEL: generic_volatile_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_float_param_0];
+; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile float, ptr %a
+ %a.add = fadd float %a.load, 1.
+ store volatile float %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_double(ptr %a) {
+; CHECK-LABEL: generic_volatile_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_double_param_0];
+; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile double, ptr %a
+ %a.add = fadd double %a.load, 1.
+ store volatile double %a.add, ptr %a
+ ret void
+}
+
+; TODO: volatile, atomic, and volatile atomic memory operations on vector types.
+; Currently, LLVM:
+; - does not allow atomic operations on vectors;
+; - allows volatile operations on vectors, but it is unclear what that means.
+; Both of the following semantics make sense in general, and PTX supports both:
+; - volatile/atomic/volatile atomic applies to the whole vector
+; - volatile/atomic/volatile atomic applies elementwise
+; Actions required:
+; - clarify the LLVM semantics for volatile on vectors and align the NVPTX backend with them.
+;   The tests below show that the current implementation picks the semantics inconsistently:
+;   * volatile <2 x i8> lowers to "elementwise volatile"
+;   * <4 x i8> lowers to "full vector volatile"
+; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics
+; - update the tests in load-store-sm70.ll as well.
+
+; TODO: make this operation consistent with the one for <4 x i8>
+; This operation lowers to an "element-wise volatile PTX operation".
+define void @generic_volatile_2xi8(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi8_param_0];
+; CHECK-NEXT: ld.volatile.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.volatile.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i8>, ptr %a
+ %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+ store volatile <2 x i8> %a.add, ptr %a
+ ret void
+}
+
+; TODO: make this operation consistent with the one for <2 x i8>
+; This operation lowers to a "full vector volatile PTX operation".
+define void @generic_volatile_4xi8(ptr %a) {
+; CHECK-LABEL: generic_volatile_4xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi8_param_0];
+; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: st.volatile.u32 [%rd1], %r12;
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i8>, ptr %a
+ %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+ store volatile <4 x i8> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_2xi16(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi16_param_0];
+; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i16>, ptr %a
+ %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+ store volatile <2 x i16> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_4xi16(ptr %a) {
+; CHECK-LABEL: generic_volatile_4xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi16_param_0];
+; CHECK-NEXT: ld.volatile.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.volatile.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i16>, ptr %a
+ %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+ store volatile <4 x i16> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_2xi32(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi32_param_0];
+; CHECK-NEXT: ld.volatile.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.volatile.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i32>, ptr %a
+ %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+ store volatile <2 x i32> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_4xi32(ptr %a) {
+; CHECK-LABEL: generic_volatile_4xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi32_param_0];
+; CHECK-NEXT: ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.volatile.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i32>, ptr %a
+ %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+ store volatile <4 x i32> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_2xi64(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi64_param_0];
+; CHECK-NEXT: ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.volatile.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i64>, ptr %a
+ %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+ store volatile <2 x i64> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_2xfloat(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xfloat_param_0];
+; CHECK-NEXT: ld.volatile.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x float>, ptr %a
+ %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
+ store volatile <2 x float> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_4xfloat(ptr %a) {
+; CHECK-LABEL: generic_volatile_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xfloat_param_0];
+; CHECK-NEXT: ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x float>, ptr %a
+ %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
+ store volatile <4 x float> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_2xdouble(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xdouble_param_0];
+; CHECK-NEXT: ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x double>, ptr %a
+ %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
+ store volatile <2 x double> %a.add, ptr %a
+ ret void
+}
- ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr %b
-
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr %c
-
- ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr %d
-
- ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr %c
-
- ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr %c
-
- ; TODO: volatile, atomic, and volatile atomic memory operations on vector types.
- ; Currently, LLVM:
- ; - does not allow atomic operations on vectors;
- ; - allows volatile operations on vectors, but it is unclear what they mean.
- ; Both of the following semantics make sense in general, and PTX supports both:
- ; - volatile/atomic/volatile atomic applies to the whole vector
- ; - volatile/atomic/volatile atomic applies elementwise
- ; Actions required:
- ; - clarify the LLVM semantics of volatile on vectors and align the NVPTX backend with them.
- ; The tests below show that the current implementation picks between these semantics inconsistently:
- ; * volatile <2 x i8> lowers to "elementwise volatile"
- ; * <4 x i8> lowers to "full vector volatile"
- ; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics.
- ; - update the tests in load-store-sm70.ll as well.
-
- ; TODO: make this operation consistent with the one for <4 x i8>.
- ; This operation lowers to an "elementwise volatile" PTX operation.
- ; CHECK: ld.volatile.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %h.load = load volatile <2 x i8>, ptr %b
- %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
- ; CHECK: st.volatile.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store volatile <2 x i8> %h.add, ptr %b
-
- ; TODO: make this operation consistent with the one for <2 x i8>.
- ; This operation lowers to a "full vector volatile" PTX operation.
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %i.load = load volatile <4 x i8>, ptr %c
- %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
- ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile <4 x i8> %i.add, ptr %c
-
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %j.load = load volatile <2 x i16>, ptr %c
- %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
- ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile <2 x i16> %j.add, ptr %c
-
- ; CHECK: ld.volatile.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %k.load = load volatile <4 x i16>, ptr %d
- %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
- ; CHECK: st.volatile.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store volatile <4 x i16> %k.add, ptr %d
-
- ; CHECK: ld.volatile.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %l.load = load volatile <2 x i32>, ptr %d
- %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
- ; CHECK: st.volatile.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
- store volatile <2 x i32> %l.add, ptr %d
-
- ; CHECK: ld.volatile.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %m.load = load volatile <4 x i32>, ptr %d
- %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
- ; CHECK: st.volatile.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
- store volatile <4 x i32> %m.add, ptr %d
-
- ; CHECK: ld.volatile.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %n.load = load volatile <2 x i64>, ptr %d
- %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
- ; CHECK: st.volatile.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
- store volatile <2 x i64> %n.add, ptr %d
-
- ; CHECK: ld.volatile.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %o.load = load volatile <2 x float>, ptr %d
- %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
- ; CHECK: st.volatile.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
- store volatile <2 x float> %o.add, ptr %d
-
- ; CHECK: ld.volatile.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %p.load = load volatile <4 x float>, ptr %d
- %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
- ; CHECK: st.volatile.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
- store volatile <4 x float> %p.add, ptr %d
-
- ; CHECK: ld.volatile.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %q.load = load volatile <2 x double>, ptr %d
- %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
- ; CHECK: st.volatile.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
- store volatile <2 x double> %q.add, ptr %d
-
- ret void
-}
-
-; CHECK-LABEL: generic_unordered_sys
-define void @generic_unordered_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
- ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; generic_unordered_sys
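+; Unordered atomic accesses lower to ld.volatile/st.volatile on SM60 and to
+; ld.relaxed.sys/st.relaxed.sys on SM70.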
+
+define void @generic_unordered_sys_i8(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
%a.load = load atomic i8, ptr %a unordered, align 1
%a.add = add i8 %a.load, 1
- ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i8 %a.add, ptr %a unordered, align 1
+ ret void
+}
+
+define void @generic_unordered_sys_i16(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr %a unordered, align 2
+ ret void
+}
- ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b unordered, align 2
- %b.add = add i16 %b.load, 1
- ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b unordered, align 2
-
- ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c unordered, align 4
- %c.add = add i32 %c.load, 1
- ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c unordered, align 4
-
- ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d unordered, align 8
- %d.add = add i64 %d.load, 1
- ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d unordered, align 8
-
- ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
- ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e unordered, align 4
-
- ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e unordered, align 8
- %f.add = fadd double %f.load, 1.
- ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e unordered, align 8
-
- ret void
-}
-
-; CHECK-LABEL: generic_unordered_volatile_sys
-define void @generic_unordered_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
- ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+define void @generic_unordered_sys_i32(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr %a unordered, align 4
+ ret void
+}
+
+define void @generic_unordered_sys_i64(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr %a unordered, align 8
+ ret void
+}
+
+define void @generic_unordered_sys_float(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0];
+; SM60-NEXT: ld.volatile.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr %a unordered, align 4
+ ret void
+}
+
+define void @generic_unordered_sys_double(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0];
+; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr %a unordered, align 8
+ ret void
+}
+
+; generic_unordered_volatile_sys
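+; Volatile unordered atomic accesses lower to ld.volatile/st.volatile on both
+; SM60 and SM70.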
+
+define void @generic_unordered_volatile_sys_i8(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a unordered, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic volatile i8 %a.add, ptr %a unordered, align 1
+ ret void
+}
- ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr %b unordered, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr %b unordered, align 2
-
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr %c unordered, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr %c unordered, align 4
+define void @generic_unordered_volatile_sys_i16(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr %a unordered, align 2
+ ret void
+}
- ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr %d unordered, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr %d unordered, align 8
+define void @generic_unordered_volatile_sys_i32(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr %a unordered, align 4
+ ret void
+}
- ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr %e unordered, align 4
+define void @generic_unordered_volatile_sys_i64(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr %a unordered, align 8
+ ret void
+}
- ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr %e unordered, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr %e unordered, align 8
+define void @generic_unordered_volatile_sys_float(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr %a unordered, align 4
+ ret void
+}
+define void @generic_unordered_volatile_sys_double(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr %a unordered, align 8
ret void
}
-; CHECK-LABEL: generic_monotonic_sys
-define void @generic_monotonic_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
- ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; generic_monotonic_sys
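+; Monotonic atomic accesses lower like unordered ones: ld.volatile/st.volatile
+; on SM60 and ld.relaxed.sys/st.relaxed.sys on SM70.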
+
+define void @generic_monotonic_sys_i8(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
%a.load = load atomic i8, ptr %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i8 %a.add, ptr %a monotonic, align 1
+ ret void
+}
+
+define void @generic_monotonic_sys_i16(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr %a monotonic, align 2
+ ret void
+}
- ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b monotonic, align 2
-
- ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c monotonic, align 4
-
- ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d monotonic, align 8
-
- ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e monotonic, align 4
- %e.add = fadd float %e.load, 1.
- ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e monotonic, align 4
-
- ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: generic_monotonic_volatile_sys
-define void @generic_monotonic_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
- ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+define void @generic_monotonic_sys_i32(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr %a monotonic, align 4
+ ret void
+}
+
+define void @generic_monotonic_sys_i64(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr %a monotonic, align 8
+ ret void
+}
+
+define void @generic_monotonic_sys_float(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0];
+; SM60-NEXT: ld.volatile.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr %a monotonic, align 4
+ ret void
+}
+
+define void @generic_monotonic_sys_double(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0];
+; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr %a monotonic, align 8
+ ret void
+}
+
+; generic_monotonic_volatile_sys
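+; Volatile monotonic atomic accesses lower to ld.volatile/st.volatile on both
+; SM60 and SM70.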
+
+define void @generic_monotonic_volatile_sys_i8(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic volatile i8 %a.add, ptr %a monotonic, align 1
+ ret void
+}
- ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr %b monotonic, align 2
-
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr %c monotonic, align 4
+define void @generic_monotonic_volatile_sys_i16(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr %a monotonic, align 2
+ ret void
+}
- ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr %d monotonic, align 8
+define void @generic_monotonic_volatile_sys_i32(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr %a monotonic, align 4
+ ret void
+}
- ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr %e monotonic, align 4
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr %e monotonic, align 4
+define void @generic_monotonic_volatile_sys_i64(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr %a monotonic, align 8
+ ret void
+}
- ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr %e monotonic, align 8
+define void @generic_monotonic_volatile_sys_float(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr %a monotonic, align 4
+ ret void
+}
+define void @generic_monotonic_volatile_sys_double(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr %a monotonic, align 8
ret void
}
;; global statespace
-; CHECK-LABEL: global_weak
-define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
- ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; global
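+; Weak accesses to the global address space lower to ld.global/st.global.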
+
+define void @global_i8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_i8_param_0];
+; CHECK-NEXT: ld.global.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.global.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load i8, ptr addrspace(1) %a
%a.add = add i8 %a.load, 1
- ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store i8 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_i16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_i16_param_0];
+; CHECK-NEXT: ld.global.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.global.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i16, ptr addrspace(1) %a
+ %a.add = add i16 %a.load, 1
+ store i16 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_i32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_i32_param_0];
+; CHECK-NEXT: ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.global.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load i32, ptr addrspace(1) %a
+ %a.add = add i32 %a.load, 1
+ store i32 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_i64(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_i64_param_0];
+; CHECK-NEXT: ld.global.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.global.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load i64, ptr addrspace(1) %a
+ %a.add = add i64 %a.load, 1
+ store i64 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_float(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_float_param_0];
+; CHECK-NEXT: ld.global.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.global.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load float, ptr addrspace(1) %a
+ %a.add = fadd float %a.load, 1.
+ store float %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_double(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_double_param_0];
+; CHECK-NEXT: ld.global.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.global.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load double, ptr addrspace(1) %a
+ %a.add = fadd double %a.load, 1.
+ store double %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_2xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi8_param_0];
+; CHECK-NEXT: ld.global.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.global.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i8>, ptr addrspace(1) %a
+ %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+ store <2 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_4xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_4xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi8_param_0];
+; CHECK-NEXT: ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: st.global.u32 [%rd1], %r12;
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i8>, ptr addrspace(1) %a
+ %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+ store <4 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
- ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(1) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(1) %b
-
- ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(1) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(1) %c
-
- ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(1) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(1) %d
-
- ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(1) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(1) %c
-
- ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(1) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(1) %c
-
- ; CHECK: ld.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %h.load = load <2 x i8>, ptr addrspace(1) %b
- %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
- ; CHECK: st.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store <2 x i8> %h.add, ptr addrspace(1) %b
-
- ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %i.load = load <4 x i8>, ptr addrspace(1) %c
- %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
- ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store <4 x i8> %i.add, ptr addrspace(1) %c
-
- ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %j.load = load <2 x i16>, ptr addrspace(1) %c
- %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
- ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store <2 x i16> %j.add, ptr addrspace(1) %c
-
- ; CHECK: ld.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %k.load = load <4 x i16>, ptr addrspace(1) %d
- %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
- ; CHECK: st.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store <4 x i16> %k.add, ptr addrspace(1) %d
-
- ; CHECK: ld.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %l.load = load <2 x i32>, ptr addrspace(1) %d
- %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
- ; CHECK: st.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
- store <2 x i32> %l.add, ptr addrspace(1) %d
-
- ; CHECK: ld.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %m.load = load <4 x i32>, ptr addrspace(1) %d
- %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
- ; CHECK: st.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
- store <4 x i32> %m.add, ptr addrspace(1) %d
-
- ; CHECK: ld.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %n.load = load <2 x i64>, ptr addrspace(1) %d
- %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
- ; CHECK: st.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
- store <2 x i64> %n.add, ptr addrspace(1) %d
-
- ; CHECK: ld.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %o.load = load <2 x float>, ptr addrspace(1) %d
- %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
- ; CHECK: st.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
- store <2 x float> %o.add, ptr addrspace(1) %d
-
- ; CHECK: ld.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %p.load = load <4 x float>, ptr addrspace(1) %d
- %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
- ; CHECK: st.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
- store <4 x float> %p.add, ptr addrspace(1) %d
-
- ; CHECK: ld.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %q.load = load <2 x double>, ptr addrspace(1) %d
- %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
- ; CHECK: st.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
- store <2 x double> %q.add, ptr addrspace(1) %d
-
- ret void
-}
-
-; CHECK-LABEL: global_volatile
-define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
- ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+define void @global_2xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi16_param_0];
+; CHECK-NEXT: ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.global.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i16>, ptr addrspace(1) %a
+ %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+ store <2 x i16> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_4xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_4xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi16_param_0];
+; CHECK-NEXT: ld.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i16>, ptr addrspace(1) %a
+ %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+ store <4 x i16> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_2xi32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi32_param_0];
+; CHECK-NEXT: ld.global.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.global.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i32>, ptr addrspace(1) %a
+ %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+ store <2 x i32> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_4xi32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_4xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi32_param_0];
+; CHECK-NEXT: ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i32>, ptr addrspace(1) %a
+ %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+ store <4 x i32> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_2xi64(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi64_param_0];
+; CHECK-NEXT: ld.global.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.global.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i64>, ptr addrspace(1) %a
+ %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+ store <2 x i64> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_2xfloat(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_2xfloat_param_0];
+; CHECK-NEXT: ld.global.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.global.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x float>, ptr addrspace(1) %a
+ %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
+ store <2 x float> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_4xfloat(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_4xfloat_param_0];
+; CHECK-NEXT: ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x float>, ptr addrspace(1) %a
+ %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
+ store <4 x float> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_2xdouble(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_2xdouble_param_0];
+; CHECK-NEXT: ld.global.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.global.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x double>, ptr addrspace(1) %a
+ %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
+ store <2 x double> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+; global_volatile
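+; Volatile accesses to the global address space lower to
+; ld.volatile.global/st.volatile.global.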
+
+define void @global_volatile_i8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i8_param_0];
+; CHECK-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.global.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load volatile i8, ptr addrspace(1) %a
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store volatile i8 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_i16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i16_param_0];
+; CHECK-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.global.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i16, ptr addrspace(1) %a
+ %a.add = add i16 %a.load, 1
+ store volatile i16 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_i32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i32_param_0];
+; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i32, ptr addrspace(1) %a
+ %a.add = add i32 %a.load, 1
+ store volatile i32 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_i64(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i64_param_0];
+; CHECK-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.global.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i64, ptr addrspace(1) %a
+ %a.add = add i64 %a.load, 1
+ store volatile i64 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_float(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_float_param_0];
+; CHECK-NEXT: ld.volatile.global.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.global.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile float, ptr addrspace(1) %a
+ %a.add = fadd float %a.load, 1.
+ store volatile float %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_double(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_double_param_0];
+; CHECK-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.global.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile double, ptr addrspace(1) %a
+ %a.add = fadd double %a.load, 1.
+ store volatile double %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_2xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi8_param_0];
+; CHECK-NEXT: ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i8>, ptr addrspace(1) %a
+ %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+ store volatile <2 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_4xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_4xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi8_param_0];
+; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r12;
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i8>, ptr addrspace(1) %a
+ %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+ store volatile <4 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_2xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi16_param_0];
+; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i16>, ptr addrspace(1) %a
+ %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+ store volatile <2 x i16> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_4xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_4xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi16_param_0];
+; CHECK-NEXT: ld.volatile.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.volatile.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i16>, ptr addrspace(1) %a
+ %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+ store volatile <4 x i16> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_2xi32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi32_param_0];
+; CHECK-NEXT: ld.volatile.global.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.volatile.global.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i32>, ptr addrspace(1) %a
+ %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+ store volatile <2 x i32> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_4xi32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_4xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi32_param_0];
+; CHECK-NEXT: ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.volatile.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i32>, ptr addrspace(1) %a
+ %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+ store volatile <4 x i32> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_2xi64(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi64_param_0];
+; CHECK-NEXT: ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.volatile.global.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i64>, ptr addrspace(1) %a
+ %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+ store volatile <2 x i64> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_2xfloat(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xfloat_param_0];
+; CHECK-NEXT: ld.volatile.global.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.global.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x float>, ptr addrspace(1) %a
+ %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
+ store volatile <2 x float> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_4xfloat(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xfloat_param_0];
+; CHECK-NEXT: ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x float>, ptr addrspace(1) %a
+ %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
+ store volatile <4 x float> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_2xdouble(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xdouble_param_0];
+; CHECK-NEXT: ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.global.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x double>, ptr addrspace(1) %a
+ %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
+ store volatile <2 x double> %a.add, ptr addrspace(1) %a
+ ret void
+}
- ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(1) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(1) %b
-
- ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(1) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(1) %c
-
- ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(1) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(1) %d
-
- ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(1) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(1) %c
-
- ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(1) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(1) %c
-
- ; CHECK: ld.volatile.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %h.load = load volatile <2 x i8>, ptr addrspace(1) %b
- %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
- ; CHECK: st.volatile.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store volatile<2 x i8> %h.add, ptr addrspace(1) %b
-
- ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %i.load = load volatile <4 x i8>, ptr addrspace(1) %c
- %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
- ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile<4 x i8> %i.add, ptr addrspace(1) %c
-
- ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %j.load = load volatile <2 x i16>, ptr addrspace(1) %c
- %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
- ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile<2 x i16> %j.add, ptr addrspace(1) %c
-
- ; CHECK: ld.volatile.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %k.load = load volatile <4 x i16>, ptr addrspace(1) %d
- %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
- ; CHECK: st.volatile.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store volatile<4 x i16> %k.add, ptr addrspace(1) %d
-
- ; CHECK: ld.volatile.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %l.load = load volatile <2 x i32>, ptr addrspace(1) %d
- %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
- ; CHECK: st.volatile.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
- store volatile<2 x i32> %l.add, ptr addrspace(1) %d
-
- ; CHECK: ld.volatile.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %m.load = load volatile <4 x i32>, ptr addrspace(1) %d
- %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
- ; CHECK: st.volatile.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
- store volatile<4 x i32> %m.add, ptr addrspace(1) %d
-
- ; CHECK: ld.volatile.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %n.load = load volatile <2 x i64>, ptr addrspace(1) %d
- %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
- ; CHECK: st.volatile.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
- store volatile<2 x i64> %n.add, ptr addrspace(1) %d
-
- ; CHECK: ld.volatile.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %o.load = load volatile <2 x float>, ptr addrspace(1) %d
- %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
- ; CHECK: st.volatile.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
- store volatile<2 x float> %o.add, ptr addrspace(1) %d
-
- ; CHECK: ld.volatile.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %p.load = load volatile <4 x float>, ptr addrspace(1) %d
- %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
- ; CHECK: st.volatile.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
- store volatile<4 x float> %p.add, ptr addrspace(1) %d
-
- ; CHECK: ld.volatile.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %q.load = load volatile <2 x double>, ptr addrspace(1) %d
- %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
- ; CHECK: st.volatile.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
- store volatile<2 x double> %q.add, ptr addrspace(1) %d
-
- ret void
-}
-
-; CHECK-LABEL: global_unordered_sys
-define void @global_unordered_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+
+; global_unordered_sys
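+; Note: per the SM60/SM70 CHECK lines below, unordered atomic loads and stores
+; lower to ld.volatile/st.volatile on sm_60 and to ld.relaxed.sys/
+; st.relaxed.sys on sm_70 and newer.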
+
+define void @global_unordered_sys_i8(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
%a.add = add i8 %a.load, 1
- ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
+ ret void
+}
+
+define void @global_unordered_sys_i16(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(1) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(1) %a unordered, align 2
+ ret void
+}
- ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
- %b.add = add i16 %b.load, 1
- ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
-
- ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
- %c.add = add i32 %c.load, 1
- ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
-
- ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
- %d.add = add i64 %d.load, 1
- ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
-
- ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
- ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
-
- ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
- %f.add = fadd double %f.load, 1.
- ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
-
- ret void
-}
-
-; CHECK-LABEL: global_unordered_volatile_sys
-define void @global_unordered_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+
+define void @global_unordered_sys_i32(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(1) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(1) %a unordered, align 4
+ ret void
+}
+
+define void @global_unordered_sys_i64(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(1) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(1) %a unordered, align 8
+ ret void
+}
+
+define void @global_unordered_sys_float(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0];
+; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(1) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(1) %a unordered, align 4
+ ret void
+}
+
+define void @global_unordered_sys_double(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0];
+; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(1) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(1) %a unordered, align 8
+ ret void
+}
+
+; global_unordered_volatile_sys
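+; Note: per the CHECK lines below, volatile unordered atomics keep the
+; ld.volatile/st.volatile lowering on sm_60, while sm_70 and newer select the
+; MMIO forms ld.mmio.relaxed.sys/st.mmio.relaxed.sys.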
+
+define void @global_unordered_volatile_sys_i8(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
%a.add = add i8 %a.load, 1
- ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
+ ret void
+}
+
+define void @global_unordered_volatile_sys_i16(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(1) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(1) %a unordered, align 2
+ ret void
+}
+
+define void @global_unordered_volatile_sys_i32(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(1) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(1) %a unordered, align 4
+ ret void
+}
+
+define void @global_unordered_volatile_sys_i64(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(1) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(1) %a unordered, align 8
+ ret void
+}
- ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
- %b.add = add i16 %b.load, 1
- ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
-
- ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
- %c.add = add i32 %c.load, 1
- ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
-
- ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
- %d.add = add i64 %d.load, 1
- ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
-
- ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
- ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
-
- ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
- %f.add = fadd double %f.load, 1.
- ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
-
- ret void
-}
-
-; CHECK-LABEL: global_monotonic_sys
-define void @global_monotonic_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+
+define void @global_unordered_volatile_sys_float(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0];
+; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(1) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(1) %a unordered, align 4
+ ret void
+}
+
+define void @global_unordered_volatile_sys_double(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0];
+; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(1) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(1) %a unordered, align 8
+ ret void
+}
+
+; global_monotonic_sys
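+; Note: monotonic atomics lower the same way as unordered ones here: volatile
+; accesses on sm_60 and relaxed.sys accesses on sm_70 and newer (see the CHECK
+; lines below).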
+
+define void @global_monotonic_sys_i8(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+ ret void
+}
- ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
-
- ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
-
- ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
-
- ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.
- ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
-
- ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: global_monotonic_volatile_sys
-define void @global_monotonic_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+
+define void @global_monotonic_sys_i16(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(1) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(1) %a monotonic, align 2
+ ret void
+}
+
+define void @global_monotonic_sys_i32(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(1) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(1) %a monotonic, align 4
+ ret void
+}
+
+define void @global_monotonic_sys_i64(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(1) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(1) %a monotonic, align 8
+ ret void
+}
+
+define void @global_monotonic_sys_float(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0];
+; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(1) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(1) %a monotonic, align 4
+ ret void
+}
+
+define void @global_monotonic_sys_double(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0];
+; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(1) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(1) %a monotonic, align 8
+ ret void
+}
+
+; global_monotonic_volatile_sys
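+; Note: volatile monotonic atomics mirror the volatile unordered case, with
+; sm_70 and newer selecting the mmio.relaxed.sys forms (see the CHECK lines
+; below).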
+
+define void @global_monotonic_volatile_sys_i8(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+ ret void
+}
- ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
-
- ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
-
- ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
-
- ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.
- ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
-
- ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
+
+define void @global_monotonic_volatile_sys_i16(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(1) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(1) %a monotonic, align 2
+ ret void
+}
+
+define void @global_monotonic_volatile_sys_i32(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(1) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(1) %a monotonic, align 4
+ ret void
+}
+
+define void @global_monotonic_volatile_sys_i64(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(1) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(1) %a monotonic, align 8
+ ret void
+}
+
+define void @global_monotonic_volatile_sys_float(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0];
+; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(1) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(1) %a monotonic, align 4
+ ret void
+}
+
+define void @global_monotonic_volatile_sys_double(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0];
+; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(1) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(1) %a monotonic, align 8
ret void
}
;; shared statespace
-; CHECK-LABEL: shared_weak
-define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
- ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+
+; shared
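+; Note: plain (non-atomic, non-volatile) loads and stores to the shared
+; statespace lower to ld.shared/st.shared without ordering qualifiers, as the
+; CHECK lines below show.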
+
+define void @shared_i8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_i8_param_0];
+; CHECK-NEXT: ld.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load i8, ptr addrspace(3) %a
%a.add = add i8 %a.load, 1
- ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store i8 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_i16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_i16_param_0];
+; CHECK-NEXT: ld.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i16, ptr addrspace(3) %a
+ %a.add = add i16 %a.load, 1
+ store i16 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_i32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_i32_param_0];
+; CHECK-NEXT: ld.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.shared.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load i32, ptr addrspace(3) %a
+ %a.add = add i32 %a.load, 1
+ store i32 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_i64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_i64_param_0];
+; CHECK-NEXT: ld.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load i64, ptr addrspace(3) %a
+ %a.add = add i64 %a.load, 1
+ store i64 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_float(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_float_param_0];
+; CHECK-NEXT: ld.shared.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.shared.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load float, ptr addrspace(3) %a
+ %a.add = fadd float %a.load, 1.
+ store float %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_double(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_double_param_0];
+; CHECK-NEXT: ld.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load double, ptr addrspace(3) %a
+ %a.add = fadd double %a.load, 1.
+ store double %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_2xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi8_param_0];
+; CHECK-NEXT: ld.shared.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.shared.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i8>, ptr addrspace(3) %a
+ %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+ store <2 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
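+; Note: same byte-wise bfe/add/bfi pattern as the global <4 x i8> case above,
+; only with ld.shared/st.shared.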
+define void @shared_4xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_4xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi8_param_0];
+; CHECK-NEXT: ld.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: st.shared.u32 [%rd1], %r12;
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i8>, ptr addrspace(3) %a
+ %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+ store <4 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_2xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi16_param_0];
+; CHECK-NEXT: ld.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.shared.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i16>, ptr addrspace(3) %a
+ %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+ store <2 x i16> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_4xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_4xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi16_param_0];
+; CHECK-NEXT: ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i16>, ptr addrspace(3) %a
+ %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+ store <4 x i16> %a.add, ptr addrspace(3) %a
+ ret void
+}
- ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(3) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(3) %b
-
- ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(3) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(3) %c
-
- ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(3) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(3) %d
-
- ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(3) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(3) %c
-
- ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(3) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(3) %c
-
- ; CHECK: ld.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %h.load = load <2 x i8>, ptr addrspace(3) %b
- %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
- ; CHECK: st.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store <2 x i8> %h.add, ptr addrspace(3) %b
-
- ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %i.load = load <4 x i8>, ptr addrspace(3) %c
- %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
- ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store <4 x i8> %i.add, ptr addrspace(3) %c
-
- ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %j.load = load <2 x i16>, ptr addrspace(3) %c
- %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
- ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store <2 x i16> %j.add, ptr addrspace(3) %c
-
- ; CHECK: ld.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %k.load = load <4 x i16>, ptr addrspace(3) %d
- %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
- ; CHECK: st.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store <4 x i16> %k.add, ptr addrspace(3) %d
-
- ; CHECK: ld.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %l.load = load <2 x i32>, ptr addrspace(3) %d
- %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
- ; CHECK: st.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
- store <2 x i32> %l.add, ptr addrspace(3) %d
-
- ; CHECK: ld.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %m.load = load <4 x i32>, ptr addrspace(3) %d
- %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
- ; CHECK: st.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
- store <4 x i32> %m.add, ptr addrspace(3) %d
-
- ; CHECK: ld.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %n.load = load <2 x i64>, ptr addrspace(3) %d
- %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
- ; CHECK: st.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
- store <2 x i64> %n.add, ptr addrspace(3) %d
-
- ; CHECK: ld.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %o.load = load <2 x float>, ptr addrspace(3) %d
- %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
- ; CHECK: st.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
- store <2 x float> %o.add, ptr addrspace(3) %d
-
- ; CHECK: ld.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %p.load = load <4 x float>, ptr addrspace(3) %d
- %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
- ; CHECK: st.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
- store <4 x float> %p.add, ptr addrspace(3) %d
-
- ; CHECK: ld.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %q.load = load <2 x double>, ptr addrspace(3) %d
- %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
- ; CHECK: st.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
- store <2 x double> %q.add, ptr addrspace(3) %d
-
- ret void
-}
-
-; CHECK-LABEL: shared_volatile
-define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
- ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+define void @shared_2xi32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi32_param_0];
+; CHECK-NEXT: ld.shared.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.shared.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i32>, ptr addrspace(3) %a
+ %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+ store <2 x i32> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_4xi32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_4xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi32_param_0];
+; CHECK-NEXT: ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i32>, ptr addrspace(3) %a
+ %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+ store <4 x i32> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_2xi64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi64_param_0];
+; CHECK-NEXT: ld.shared.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.shared.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i64>, ptr addrspace(3) %a
+ %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+ store <2 x i64> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_2xfloat(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xfloat_param_0];
+; CHECK-NEXT: ld.shared.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.shared.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x float>, ptr addrspace(3) %a
+ %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
+ store <2 x float> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_4xfloat(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xfloat_param_0];
+; CHECK-NEXT: ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x float>, ptr addrspace(3) %a
+ %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
+ store <4 x float> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_2xdouble(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xdouble_param_0];
+; CHECK-NEXT: ld.shared.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.shared.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x double>, ptr addrspace(3) %a
+ %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
+ store <2 x double> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; shared_volatile
+
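+; Volatile loads/stores to the shared statespace should lower to
+; ld.volatile.shared/st.volatile.shared, with no ordering qualifier,
+; under both run configurations (the checks below use the common CHECK prefix).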
+define void @shared_volatile_i8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i8_param_0];
+; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load volatile i8, ptr addrspace(3) %a
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store volatile i8 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_i16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i16_param_0];
+; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i16, ptr addrspace(3) %a
+ %a.add = add i16 %a.load, 1
+ store volatile i16 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_i32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i32_param_0];
+; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i32, ptr addrspace(3) %a
+ %a.add = add i32 %a.load, 1
+ store volatile i32 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_i64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i64_param_0];
+; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i64, ptr addrspace(3) %a
+ %a.add = add i64 %a.load, 1
+ store volatile i64 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_float(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_float_param_0];
+; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile float, ptr addrspace(3) %a
+ %a.add = fadd float %a.load, 1.
+ store volatile float %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_double(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_double_param_0];
+; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile double, ptr addrspace(3) %a
+ %a.add = fadd double %a.load, 1.
+ store volatile double %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_2xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi8_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i8>, ptr addrspace(3) %a
+ %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+ store volatile <2 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_4xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_4xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi8_param_0];
+; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r12;
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i8>, ptr addrspace(3) %a
+ %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+ store volatile <4 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_2xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi16_param_0];
+; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i16>, ptr addrspace(3) %a
+ %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+ store volatile <2 x i16> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_4xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_4xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi16_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i16>, ptr addrspace(3) %a
+ %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+ store volatile <4 x i16> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_2xi32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi32_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i32>, ptr addrspace(3) %a
+ %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+ store volatile <2 x i32> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_4xi32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_4xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi32_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i32>, ptr addrspace(3) %a
+ %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+ store volatile <4 x i32> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_2xi64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi64_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i64>, ptr addrspace(3) %a
+ %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+ store volatile <2 x i64> %a.add, ptr addrspace(3) %a
+ ret void
+}
- ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(3) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(3) %b
-
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(3) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(3) %c
-
- ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(3) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(3) %d
-
- ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(3) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(3) %c
-
- ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(3) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(3) %c
-
- ; CHECK: ld.volatile.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %h.load = load volatile <2 x i8>, ptr addrspace(3) %b
- %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
- ; CHECK: st.volatile.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store volatile <2 x i8> %h.add, ptr addrspace(3) %b
-
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %i.load = load volatile <4 x i8>, ptr addrspace(3) %c
- %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile <4 x i8> %i.add, ptr addrspace(3) %c
-
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %j.load = load volatile <2 x i16>, ptr addrspace(3) %c
- %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile <2 x i16> %j.add, ptr addrspace(3) %c
-
- ; CHECK: ld.volatile.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %k.load = load volatile <4 x i16>, ptr addrspace(3) %d
- %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
- ; CHECK: st.volatile.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store volatile <4 x i16> %k.add, ptr addrspace(3) %d
-
- ; CHECK: ld.volatile.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %l.load = load volatile <2 x i32>, ptr addrspace(3) %d
- %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
- ; CHECK: st.volatile.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
- store volatile <2 x i32> %l.add, ptr addrspace(3) %d
-
- ; CHECK: ld.volatile.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %m.load = load volatile <4 x i32>, ptr addrspace(3) %d
- %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
- ; CHECK: st.volatile.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
- store volatile <4 x i32> %m.add, ptr addrspace(3) %d
-
- ; CHECK: ld.volatile.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %n.load = load volatile <2 x i64>, ptr addrspace(3) %d
- %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
- ; CHECK: st.volatile.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
- store volatile <2 x i64> %n.add, ptr addrspace(3) %d
-
- ; CHECK: ld.volatile.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %o.load = load volatile <2 x float>, ptr addrspace(3) %d
- %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
- ; CHECK: st.volatile.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
- store volatile <2 x float> %o.add, ptr addrspace(3) %d
-
- ; CHECK: ld.volatile.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %p.load = load volatile <4 x float>, ptr addrspace(3) %d
- %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
- ; CHECK: st.volatile.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
- store volatile <4 x float> %p.add, ptr addrspace(3) %d
-
- ; CHECK: ld.volatile.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %q.load = load volatile <2 x double>, ptr addrspace(3) %d
- %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
- ; CHECK: st.volatile.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
- store volatile <2 x double> %q.add, ptr addrspace(3) %d
-
- ret void
-}
-
-; CHECK-LABEL: shared_unordered_sys
-define void @shared_unordered_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+define void @shared_volatile_2xfloat(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x float>, ptr addrspace(3) %a
+ %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
+ store volatile <2 x float> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_4xfloat(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x float>, ptr addrspace(3) %a
+ %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
+ store volatile <4 x float> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_2xdouble(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x double>, ptr addrspace(3) %a
+ %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
+ store volatile <2 x double> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; shared_unordered_sys
+
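+; Unordered atomics on shared memory: on sm_60 they should lower to volatile
+; accesses (ld.volatile.shared/st.volatile.shared); on sm_70 and later they
+; should lower to relaxed system-scope accesses (ld.relaxed.sys.shared/
+; st.relaxed.sys.shared).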
+define void @shared_unordered_sys_i8(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
%a.add = add i8 %a.load, 1
- ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
+ ret void
+}
- ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
- %b.add = add i16 %b.load, 1
- ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
-
- ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
- %c.add = add i32 %c.load, 1
- ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
-
- ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
- %d.add = add i64 %d.load, 1
- ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
-
- ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
- ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
-
- ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
- %f.add = fadd double %f.load, 1.
- ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
-
- ret void
-}
-
-; CHECK-LABEL: shared_unordered_volatile_sys
-define void @shared_unordered_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+define void @shared_unordered_sys_i16(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(3) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(3) %a unordered, align 2
+ ret void
+}
+
+define void @shared_unordered_sys_i32(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(3) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(3) %a unordered, align 4
+ ret void
+}
+
+define void @shared_unordered_sys_i64(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(3) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(3) %a unordered, align 8
+ ret void
+}
+
+define void @shared_unordered_sys_float(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0];
+; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(3) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(3) %a unordered, align 4
+ ret void
+}
+
+define void @shared_unordered_sys_double(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0];
+; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(3) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(3) %a unordered, align 8
+ ret void
+}
+
+; shared_unordered_volatile_sys
+
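+; Volatile unordered atomics should lower to volatile accesses
+; (ld.volatile.shared/st.volatile.shared) under both run configurations.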
+define void @shared_unordered_volatile_sys_i8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
+ ret void
+}
- ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
-
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
+define void @shared_unordered_volatile_sys_i16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(3) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(3) %a unordered, align 2
+ ret void
+}
- ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
+define void @shared_unordered_volatile_sys_i32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(3) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(3) %a unordered, align 4
+ ret void
+}
- ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
+define void @shared_unordered_volatile_sys_i64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(3) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(3) %a unordered, align 8
+ ret void
+}
- ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
+define void @shared_unordered_volatile_sys_float(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(3) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(3) %a unordered, align 4
+ ret void
+}
+
+define void @shared_unordered_volatile_sys_double(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(3) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(3) %a unordered, align 8
ret void
}
-; CHECK-LABEL: shared_monotonic_sys
-define void @shared_monotonic_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; shared_monotonic_sys
+
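+; Monotonic atomics follow the same split as unordered: volatile accesses on
+; sm_60, relaxed system-scope accesses on sm_70 and later.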
+define void @shared_monotonic_sys_i8(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ ret void
+}
+
+define void @shared_monotonic_sys_i16(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(3) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(3) %a monotonic, align 2
+ ret void
+}
+
+define void @shared_monotonic_sys_i32(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(3) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(3) %a monotonic, align 4
+ ret void
+}
+
+define void @shared_monotonic_sys_i64(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(3) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(3) %a monotonic, align 8
+ ret void
+}
- ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
-
- ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
-
- ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
-
- ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.
- ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
-
- ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: shared_monotonic_volatile_sys
-define void @shared_monotonic_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+define void @shared_monotonic_sys_float(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0];
+; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(3) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(3) %a monotonic, align 4
+ ret void
+}
+
+define void @shared_monotonic_sys_double(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0];
+; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(3) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(3) %a monotonic, align 8
+ ret void
+}
+
+; shared_monotonic_volatile_sys
+
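+; Volatile monotonic atomics likewise should lower to volatile accesses
+; under both run configurations.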
+define void @shared_monotonic_volatile_sys_i8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ ret void
+}
- ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
-
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+define void @shared_monotonic_volatile_sys_i16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(3) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(3) %a monotonic, align 2
+ ret void
+}
- ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+define void @shared_monotonic_volatile_sys_i32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(3) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(3) %a monotonic, align 4
+ ret void
+}
- ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
+define void @shared_monotonic_volatile_sys_i64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(3) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(3) %a monotonic, align 8
+ ret void
+}
- ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
+define void @shared_monotonic_volatile_sys_float(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(3) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(3) %a monotonic, align 4
+ ret void
+}
+
+define void @shared_monotonic_volatile_sys_double(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(3) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(3) %a monotonic, align 8
ret void
}
;; local statespace
-; CHECK-LABEL: local_weak
-define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; local
+
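+; Weak (non-atomic, non-volatile) loads/stores to the local statespace should
+; lower to plain ld.local/st.local.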
+define void @local_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load i8, ptr addrspace(5) %a
%a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store i8 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i16, ptr addrspace(5) %a
+ %a.add = add i16 %a.load, 1
+ store i16 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load i32, ptr addrspace(5) %a
+ %a.add = add i32 %a.load, 1
+ store i32 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load i64, ptr addrspace(5) %a
+ %a.add = add i64 %a.load, 1
+ store i64 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load float, ptr addrspace(5) %a
+ %a.add = fadd float %a.load, 1.
+ store float %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load double, ptr addrspace(5) %a
+ %a.add = fadd double %a.load, 1.
+ store double %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_2xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi8_param_0];
+; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i8>, ptr addrspace(5) %a
+ %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+ store <2 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_4xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_4xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi8_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: st.local.u32 [%rd1], %r12;
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i8>, ptr addrspace(5) %a
+ %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+ store <4 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_2xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi16_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i16>, ptr addrspace(5) %a
+ %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+ store <2 x i16> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_4xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_4xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi16_param_0];
+; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i16>, ptr addrspace(5) %a
+ %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+ store <4 x i16> %a.add, ptr addrspace(5) %a
+ ret void
+}
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(5) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(5) %b
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(5) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(5) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(5) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(5) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %h.load = load <2 x i8>, ptr addrspace(5) %b
- %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
- ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store <2 x i8> %h.add, ptr addrspace(5) %b
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %i.load = load <4 x i8>, ptr addrspace(5) %c
- %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store <4 x i8> %i.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %j.load = load <2 x i16>, ptr addrspace(5) %c
- %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store <2 x i16> %j.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %k.load = load <4 x i16>, ptr addrspace(5) %d
- %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
- ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store <4 x i16> %k.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %l.load = load <2 x i32>, ptr addrspace(5) %d
- %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
- ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
- store <2 x i32> %l.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %m.load = load <4 x i32>, ptr addrspace(5) %d
- %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
- ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
- store <4 x i32> %m.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %n.load = load <2 x i64>, ptr addrspace(5) %d
- %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
- ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
- store <2 x i64> %n.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %o.load = load <2 x float>, ptr addrspace(5) %d
- %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
- ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
- store <2 x float> %o.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %p.load = load <4 x float>, ptr addrspace(5) %d
- %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
- ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
- store <4 x float> %p.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %q.load = load <2 x double>, ptr addrspace(5) %d
- %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
- ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
- store <2 x double> %q.add, ptr addrspace(5) %d
-
- ret void
-}
-
-; CHECK-LABEL: local_volatile
-define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+define void @local_2xi32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi32_param_0];
+; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i32>, ptr addrspace(5) %a
+ %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+ store <2 x i32> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_4xi32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_4xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi32_param_0];
+; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i32>, ptr addrspace(5) %a
+ %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+ store <4 x i32> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_2xi64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi64_param_0];
+; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i64>, ptr addrspace(5) %a
+ %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+ store <2 x i64> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_2xfloat(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xfloat_param_0];
+; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x float>, ptr addrspace(5) %a
+ %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
+ store <2 x float> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_4xfloat(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_4xfloat_param_0];
+; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x float>, ptr addrspace(5) %a
+ %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
+ store <4 x float> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_2xdouble(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xdouble_param_0];
+; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x double>, ptr addrspace(5) %a
+ %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
+ store <2 x double> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+; local_volatile
+
+define void @local_volatile_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load volatile i8, ptr addrspace(5) %a
%a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store volatile i8 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i16, ptr addrspace(5) %a
+ %a.add = add i16 %a.load, 1
+ store volatile i16 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i32, ptr addrspace(5) %a
+ %a.add = add i32 %a.load, 1
+ store volatile i32 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i64, ptr addrspace(5) %a
+ %a.add = add i64 %a.load, 1
+ store volatile i64 %a.add, ptr addrspace(5) %a
+ ret void
+}
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(5) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(5) %b
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(5) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(5) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(5) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(5) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %h.load = load volatile <2 x i8>, ptr addrspace(5) %b
- %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
- ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store volatile <2 x i8> %h.add, ptr addrspace(5) %b
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %i.load = load volatile <4 x i8>, ptr addrspace(5) %c
- %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile <4 x i8> %i.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %j.load = load volatile <2 x i16>, ptr addrspace(5) %c
- %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile <2 x i16> %j.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %k.load = load volatile <4 x i16>, ptr addrspace(5) %d
- %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
- ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
- store volatile <4 x i16> %k.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %l.load = load volatile <2 x i32>, ptr addrspace(5) %d
- %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
- ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
- store volatile <2 x i32> %l.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %m.load = load volatile <4 x i32>, ptr addrspace(5) %d
- %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
- ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
- store volatile <4 x i32> %m.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %n.load = load volatile <2 x i64>, ptr addrspace(5) %d
- %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
- ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
- store volatile <2 x i64> %n.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %o.load = load volatile <2 x float>, ptr addrspace(5) %d
- %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
- ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
- store volatile <2 x float> %o.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %p.load = load volatile <4 x float>, ptr addrspace(5) %d
- %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
- ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
- store volatile <4 x float> %p.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
- %q.load = load volatile <2 x double>, ptr addrspace(5) %d
- %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
- ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
- store volatile <2 x double> %q.add, ptr addrspace(5) %d
-
- ret void
-}
-
-; CHECK-LABEL: local_unordered_sys
-define void @local_unordered_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+define void @local_volatile_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile float, ptr addrspace(5) %a
+ %a.add = fadd float %a.load, 1.
+ store volatile float %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile double, ptr addrspace(5) %a
+ %a.add = fadd double %a.load, 1.
+ store volatile double %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_2xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi8_param_0];
+; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i8>, ptr addrspace(5) %a
+ %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+ store volatile <2 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_4xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_4xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi8_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: st.local.u32 [%rd1], %r12;
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i8>, ptr addrspace(5) %a
+ %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+ store volatile <4 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_2xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi16_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i16>, ptr addrspace(5) %a
+ %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+ store volatile <2 x i16> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_4xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_4xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi16_param_0];
+; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i16>, ptr addrspace(5) %a
+ %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+ store volatile <4 x i16> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_2xi32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi32_param_0];
+; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i32>, ptr addrspace(5) %a
+ %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+ store volatile <2 x i32> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_4xi32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_4xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi32_param_0];
+; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x i32>, ptr addrspace(5) %a
+ %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+ store volatile <4 x i32> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_2xi64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi64_param_0];
+; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x i64>, ptr addrspace(5) %a
+ %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+ store volatile <2 x i64> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_2xfloat(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xfloat_param_0];
+; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x float>, ptr addrspace(5) %a
+ %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
+ store volatile <2 x float> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_4xfloat(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xfloat_param_0];
+; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <4 x float>, ptr addrspace(5) %a
+ %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
+ store volatile <4 x float> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_2xdouble(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xdouble_param_0];
+; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <2 x double>, ptr addrspace(5) %a
+ %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
+ store volatile <2 x double> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+; local_unordered_sys
+
+define void @local_unordered_sys_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
+ ret void
+}
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
+define void @local_unordered_sys_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(5) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(5) %a unordered, align 2
+ ret void
+}
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
+define void @local_unordered_sys_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(5) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(5) %a unordered, align 4
+ ret void
+}
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
+define void @local_unordered_sys_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(5) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(5) %a unordered, align 8
+ ret void
+}
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
+define void @local_unordered_sys_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(5) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(5) %a unordered, align 4
+ ret void
+}
+define void @local_unordered_sys_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(5) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(5) %a unordered, align 8
ret void
}
-; CHECK-LABEL: local_unordered_volatile_sys
-define void @local_unordered_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; local_unordered_volatile_sys
+
+define void @local_unordered_volatile_sys_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
+ ret void
+}
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
+define void @local_unordered_volatile_sys_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(5) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(5) %a unordered, align 2
+ ret void
+}
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
+define void @local_unordered_volatile_sys_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(5) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(5) %a unordered, align 4
+ ret void
+}
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
+define void @local_unordered_volatile_sys_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(5) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(5) %a unordered, align 8
+ ret void
+}
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
+define void @local_unordered_volatile_sys_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(5) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(5) %a unordered, align 4
+ ret void
+}
+define void @local_unordered_volatile_sys_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(5) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(5) %a unordered, align 8
ret void
}
-; CHECK-LABEL: local_monotonic_sys
-define void @local_monotonic_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; local_monotonic_sys
+
+define void @local_monotonic_sys_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ ret void
+}
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+define void @local_monotonic_sys_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(5) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(5) %a monotonic, align 2
+ ret void
+}
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+define void @local_monotonic_sys_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(5) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(5) %a monotonic, align 4
+ ret void
+}
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
+define void @local_monotonic_sys_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(5) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(5) %a monotonic, align 8
+ ret void
+}
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
+define void @local_monotonic_sys_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(5) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(5) %a monotonic, align 4
+ ret void
+}
+define void @local_monotonic_sys_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(5) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(5) %a monotonic, align 8
ret void
}
-; CHECK-LABEL: local_monotonic_volatile
-define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; local_monotonic_volatile_sys
+
+define void @local_monotonic_volatile_sys_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ ret void
+}
+
+define void @local_monotonic_volatile_sys_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(5) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(5) %a monotonic, align 2
+ ret void
+}
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
+define void @local_monotonic_volatile_sys_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(5) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(5) %a monotonic, align 4
+ ret void
+}
+
+define void @local_monotonic_volatile_sys_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(5) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(5) %a monotonic, align 8
+ ret void
+}
+
+define void @local_monotonic_volatile_sys_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(5) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(5) %a monotonic, align 4
+ ret void
+}
+define void @local_monotonic_volatile_sys_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(5) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(5) %a monotonic, align 8
ret void
}
diff --git a/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll b/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll
index e139d3c..5bd3580 100644
--- a/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll
+++ b/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll
@@ -1,12 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
; The load is to the high byte of the 2-byte store
@g = global i8 -75
define void @f(i16 %v) {
-; CHECK-LABEL: f
-; CHECK: sth 3, -2(1)
-; CHECK: lbz 3, -2(1)
+; CHECK-LABEL: f:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addis 4, 2, .LC0@toc@ha
+; CHECK-NEXT: sth 3, -2(1)
+; CHECK-NEXT: ld 4, .LC0@toc@l(4)
+; CHECK-NEXT: lbz 3, -2(1)
+; CHECK-NEXT: stb 3, 0(4)
+; CHECK-NEXT: blr
%p32 = alloca i16
store i16 %v, ptr %p32
%tmp = load i8, ptr %p32
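For readers reconstructing the scenario, a minimal C++ analogue of the IR above (a hypothetical illustration, not part of the patch): on big-endian PowerPC the lowest-addressed byte of an i16 is its high byte, so the byte load reads the high half of the value just stored, which is the store-forwarding case the new CHECK lines pin down.

  #include <cstdint>

  int8_t g = -75;

  void f(int16_t v) {
    int16_t p32 = v;                       // the 2-byte store (sth 3, -2(1))
    g = *reinterpret_cast<int8_t *>(&p32); // byte load of the same slot picks
  }                                        // up the high byte on big-endian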
diff --git a/llvm/test/CodeGen/SPIRV/decoration-order.ll b/llvm/test/CodeGen/SPIRV/decoration-order.ll
new file mode 100644
index 0000000..e8299e9
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/decoration-order.ll
@@ -0,0 +1,15 @@
+; RUN: %if spirv-tools %{ llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; This test checks that the OpDecorate MIR is generated after the associated
+; vreg definition in the case of an array size declared through this lowering.
+
+define spir_func i32 @foo() {
+entry:
+ %var = alloca i64
+ br label %block
+
+block:
+ call void @llvm.memset.p0.i64(ptr align 8 %var, i8 0, i64 24, i1 false)
+ ret i32 0
+}
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
diff --git a/llvm/test/CodeGen/X86/tls-function-argument.ll b/llvm/test/CodeGen/X86/tls-function-argument.ll
new file mode 100644
index 0000000..9b6ab52
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tls-function-argument.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64 -verify-machineinstrs -relocation-model=pic < %s | FileCheck %s
+
+; Passing a pointer to thread-local storage to a function can be problematic
+; since computing such addresses requires a function call that is introduced
+; very late in instruction selection. We need to ensure that we don't introduce
+; nested call sequence markers if this function call happens in a call sequence.
+
+@TLS = internal thread_local global i64 zeroinitializer, align 8
+declare void @bar(ptr)
+define internal void @foo() {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbx, -16
+; CHECK-NEXT: leaq TLS@TLSLD(%rip), %rdi
+; CHECK-NEXT: callq __tls_get_addr@PLT
+; CHECK-NEXT: leaq TLS@DTPOFF(%rax), %rbx
+; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: callq bar@PLT
+; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: callq bar@PLT
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+ call void @bar(ptr @TLS)
+ call void @bar(ptr @TLS)
+ ret void
+}
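As a rough source-level analogue of the IR above (hypothetical C++, not part of the patch): under -fPIC on x86-64 each use of the thread-local's address is materialized by a late __tls_get_addr call, which is exactly what must not end up nested inside the call sequence for bar.

  static thread_local long TLS;

  void bar(long *);

  void foo() {
    // Computing &TLS under PIC lowers to a __tls_get_addr call introduced
    // very late in ISel; the backend computes the address once up front
    // (the leaq/callq pair in the CHECK lines) so that call does not nest
    // inside bar's own call-sequence markers.
    bar(&TLS);
    bar(&TLS);
  }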
diff --git a/llvm/test/Transforms/FunctionSpecialization/cmp-with-range.ll b/llvm/test/Transforms/FunctionSpecialization/cmp-with-range.ll
new file mode 100644
index 0000000..d692294
--- /dev/null
+++ b/llvm/test/Transforms/FunctionSpecialization/cmp-with-range.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
+; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=1 \
+; RUN: -funcspec-for-literal-constant=true \
+; RUN: -funcspec-min-codesize-savings=50 \
+; RUN: -funcspec-min-latency-savings=0 \
+; RUN: -S < %s | FileCheck %s
+
+; Verify that we are able to estimate the codesize savings arising from a branch
+; based on a comparison with a value found to have a constant range by IPSCCP.
+define i32 @main() {
+ %notspec = call i32 @test_use_on_lhs(i32 8)
+ %spec1 = call i32 @test_use_on_lhs(i32 0)
+ %spec2 = call i32 @test_use_on_rhs(i32 1)
+ %sum1 = add i32 %notspec, %spec1
+ %sum2 = add i32 %sum1, %spec2
+ ret i32 %sum2
+}
+
+define i32 @test_use_on_lhs(i32 %x) {
+entry:
+ %range = call i32 @foo(), !range !{ i32 1, i32 0 }
+ %bound = shl nsw nuw i32 %range, 3
+ %cmp = icmp uge i32 %x, %bound
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ call void @do_something()
+ call void @do_something()
+ call void @do_something()
+ call void @do_something()
+ br label %if.end
+
+if.end:
+ %res = phi i32 [ 0, %entry ], [ 1, %if.then]
+ ret i32 %res
+}
+
+define i32 @test_use_on_rhs(i32 %x) {
+entry:
+ %range = call i32 @foo(), !range !{ i32 1, i32 0 }
+ %bound = shl nsw nuw i32 %range, 3
+ %x.sub = sub nsw nuw i32 %x, 1
+ %cmp = icmp ult i32 %bound, %x.sub
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ call void @do_something()
+ call void @do_something()
+ call void @do_something()
+ call void @do_something()
+ br label %if.end
+
+if.end:
+ %res = phi i32 [ 0, %entry ], [ 1, %if.then]
+ ret i32 %res
+}
+
+declare i32 @foo()
+declare void @do_something()
+; CHECK-LABEL: define range(i32 0, 2) i32 @main() {
+; CHECK-NEXT: [[NOTSPEC:%.*]] = call i32 @test_use_on_lhs(i32 8)
+; CHECK-NEXT: [[SPEC1:%.*]] = call i32 @test_use_on_lhs.specialized.1(i32 0)
+; CHECK-NEXT: [[SPEC2:%.*]] = call i32 @test_use_on_rhs.specialized.2(i32 1)
+; CHECK-NEXT: [[SUM:%.*]] = add nuw nsw i32 [[NOTSPEC]], 0
+; CHECK-NEXT: [[RES:%.*]] = add nuw nsw i32 [[SUM]], 0
+; CHECK-NEXT: ret i32 [[RES]]
+;
+;
+; CHECK-LABEL: define range(i32 0, 2) i32 @test_use_on_lhs(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0:![0-9]+]]
+; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3
+; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[X]], [[BOUND]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: br label %[[IF_END]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[IF_THEN]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+;
+; CHECK-LABEL: define range(i32 0, 2) i32 @test_use_on_rhs(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0]]
+; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3
+; CHECK-NEXT: [[X_SUB:%.*]] = sub nuw nsw i32 [[X]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[BOUND]], [[X_SUB]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: br label %[[IF_END]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[IF_THEN]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+;
+; CHECK-LABEL: define internal i32 @test_use_on_lhs.specialized.1(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0]]
+; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3
+; CHECK-NEXT: br label %[[IF_END:.*]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: ret i32 poison
+;
+;
+; CHECK-LABEL: define internal i32 @test_use_on_rhs.specialized.2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[RANGE:%.*]] = call i32 @foo(), !range [[RNG0]]
+; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[RANGE]], 3
+; CHECK-NEXT: br label %[[IF_END:.*]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: ret i32 poison
+;
+;.
+; CHECK: [[RNG0]] = !{i32 1, i32 0}
+;.
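A compact sketch of the range reasoning the cost model must reproduce, written against the public ConstantRange API (a simplified, hypothetical illustration, not the pass's code; the helper name is invented): !range !{i32 1, i32 0} denotes the wrapped range [1, 0), i.e. any i32 except 0, and the nuw/nsw shift keeps %bound inside [8, 0), so for the %x = 0 specialization the comparison is provably false and the whole %if.then block counts as codesize savings.

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/ConstantRange.h"
  #include "llvm/IR/InstrTypes.h"
  using namespace llvm;

  bool ifThenIsDeadForZero() {
    // !range !{i32 1, i32 0} is the wrapped range [1, 0): every i32 but 0.
    ConstantRange Range(APInt(32, 1), APInt(32, 0));
    // shl nuw nsw %range, 3 cannot wrap, so %bound lies in [8, 0); stated
    // directly here instead of being recomputed from the shift.
    ConstantRange Bound(APInt(32, 8), APInt(32, 0));
    // For %x = 0: 0 u< every value in [8, 0), so `icmp uge %x, %bound` is
    // always false and the branch to %if.then folds away.
    ConstantRange X(APInt(32, 0));
    return X.icmp(CmpInst::ICMP_ULT, Bound);
  }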
diff --git a/llvm/test/Transforms/SLPVectorizer/reudction-or-non-poisoned.ll b/llvm/test/Transforms/SLPVectorizer/reudction-or-non-poisoned.ll
new file mode 100644
index 0000000..ac47c60
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/reudction-or-non-poisoned.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s
+
+define i1 @test(i32 %x, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: define i1 @test(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[D]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[B]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP4]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: ret i1 [[TMP6]]
+;
+ %cmp = icmp sgt i32 %x, 1
+ %cmp2 = icmp sgt i32 %b, 1
+ %cmp3 = icmp sgt i32 %c, 1
+ %cmp4 = icmp sgt i32 %d, 1
+ %sel2 = select i1 %cmp4, i1 true, i1 %cmp2
+ %sel3 = select i1 %sel2, i1 true, i1 %cmp3
+ %sel4 = select i1 %cmp, i1 true, i1 %cmp4
+ %ret = or i1 %sel3, %sel4
+ ret i1 %ret
+}
diff --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
index c8fd366..9f76e9f 100644
--- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
+++ b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
@@ -469,3 +469,38 @@ TEST_F(FunctionSpecializationTest, PhiNode) {
EXPECT_TRUE(Test > 0);
}

+TEST_F(FunctionSpecializationTest, BinOp) {
+ // Verify that we can handle binary operators even when only one operand is
+ // constant.
+ const char *ModuleString = R"(
+ define i32 @foo(i1 %a, i1 %b) {
+ %and1 = and i1 %a, %b
+ %and2 = and i1 %b, %and1
+ %sel = select i1 %and2, i32 1, i32 0
+ ret i32 %sel
+ }
+ )";
+
+ Module &M = parseModule(ModuleString);
+ Function *F = M.getFunction("foo");
+ FunctionSpecializer Specializer = getSpecializerFor(F);
+ InstCostVisitor Visitor = Specializer.getInstCostVisitorFor(F);
+
+ Constant *False = ConstantInt::getFalse(M.getContext());
+ BasicBlock &BB = F->front();
+ Instruction &And1 = BB.front();
+ Instruction &And2 = *++BB.begin();
+  Instruction &Select = *std::next(BB.begin(), 2);
+
+ Cost RefCodeSize = getCodeSizeSavings(And1) + getCodeSizeSavings(And2) +
+ getCodeSizeSavings(Select);
+ Cost RefLatency = getLatencySavings(F);
+
+ Cost TestCodeSize = Visitor.getCodeSizeSavingsForArg(F->getArg(0), False);
+ Cost TestLatency = Visitor.getLatencySavingsForKnownConstants();
+
+ EXPECT_EQ(TestCodeSize, RefCodeSize);
+ EXPECT_TRUE(TestCodeSize > 0);
+ EXPECT_EQ(TestLatency, RefLatency);
+ EXPECT_TRUE(TestLatency > 0);
+}
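The property this test leans on can be reproduced in isolation with InstSimplify (a hypothetical, self-contained sketch; the headers and calls are real LLVM API, but the harness is not the test's code): substituting false for %a lets `and i1 false, %b` fold even though %b stays unknown, which then cascades through %and2 and the select.

  #include "llvm/Analysis/InstructionSimplify.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("m", Ctx);
    auto *I1 = Type::getInt1Ty(Ctx);
    auto *FT = FunctionType::get(Type::getInt32Ty(Ctx), {I1, I1}, false);
    Function *F = Function::Create(FT, Function::ExternalLinkage, "foo", M);
    Value *B = F->getArg(1);                      // %b stays unknown
    Constant *False = ConstantInt::getFalse(Ctx); // the value bound to %a

    // `and i1 false, %b` simplifies to false with only one constant operand;
    // the cost visitor can then treat %and1, %and2 and %sel as known and
    // count their removal as codesize/latency savings.
    SimplifyQuery SQ(M.getDataLayout());
    Value *And1 = simplifyBinOp(Instruction::And, False, B, SQ);
    return And1 == False ? 0 : 1;
  }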